|
| 1 | +From faee2893e499bdcaa3a511bcff197366b8a87968 Mon Sep 17 00:00:00 2001 |
| 2 | +From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> |
| 3 | +Date: Sat, 26 Aug 2023 15:08:59 +0200 |
| 4 | +Subject: [PATCH] Expose line and column information for use in PHP |
| 5 | + |
| 6 | +--- |
| 7 | + source/lexbor/dom/interfaces/node.h | 2 ++ |
| 8 | + source/lexbor/html/token.h | 2 ++ |
| 9 | + source/lexbor/html/tokenizer.c | 22 +++++++++++++++++++++- |
| 10 | + source/lexbor/html/tokenizer.h | 2 ++ |
| 11 | + source/lexbor/html/tokenizer/state.h | 2 ++ |
| 12 | + source/lexbor/html/tree.c | 11 +++++++++++ |
| 13 | + source/lexbor/html/tree/error.c | 5 +++-- |
| 14 | + source/lexbor/html/tree/error.h | 5 +++-- |
| 15 | + 8 files changed, 46 insertions(+), 5 deletions(-) |
| 16 | + |
| 17 | +diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h |
| 18 | +index 4a10197..ff9c924 100755 |
| 19 | +--- a/source/lexbor/dom/interfaces/node.h |
| 20 | ++++ b/source/lexbor/dom/interfaces/node.h |
| 21 | +@@ -58,6 +58,8 @@ struct lxb_dom_node { |
| 22 | + |
| 23 | + lxb_dom_node_type_t type; |
| 24 | + |
| 25 | ++ size_t line; |
| 26 | ++ |
| 27 | + #ifdef LXB_DOM_NODE_USER_VARIABLES |
| 28 | + LXB_DOM_NODE_USER_VARIABLES |
| 29 | + #endif /* LXB_DOM_NODE_USER_VARIABLES */ |
| 30 | +diff --git a/source/lexbor/html/token.h b/source/lexbor/html/token.h |
| 31 | +index 79accd0..0b7f4fd 100755 |
| 32 | +--- a/source/lexbor/html/token.h |
| 33 | ++++ b/source/lexbor/html/token.h |
| 34 | +@@ -33,6 +33,8 @@ enum lxb_html_token_type { |
| 35 | + typedef struct { |
| 36 | + const lxb_char_t *begin; |
| 37 | + const lxb_char_t *end; |
| 38 | ++ size_t line; |
| 39 | ++ size_t column; |
| 40 | + |
| 41 | + const lxb_char_t *text_start; |
| 42 | + const lxb_char_t *text_end; |
| 43 | +diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c |
| 44 | +index 741bced..a399758 100755 |
| 45 | +--- a/source/lexbor/html/tokenizer.c |
| 46 | ++++ b/source/lexbor/html/tokenizer.c |
| 47 | +@@ -91,6 +91,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz) |
| 48 | + |
| 49 | + tkz->pos = tkz->start; |
| 50 | + tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE; |
| 51 | ++ /* current_line & current_column already initialized by calloc (zero-based) */ |
| 52 | + |
| 53 | + tkz->tree = NULL; |
| 54 | + tkz->tags = NULL; |
| 55 | +@@ -152,6 +153,8 @@ lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to, |
| 56 | + tkz_to->start = tkz_from->start; |
| 57 | + tkz_to->end = tkz_from->end; |
| 58 | + tkz_to->pos = tkz_to->start; |
| 59 | ++ tkz_to->current_line = tkz_from->current_line; |
| 60 | ++ tkz_to->current_column = tkz_from->current_column; |
| 61 | + |
| 62 | + return LXB_STATUS_OK; |
| 63 | + } |
| 64 | +@@ -312,7 +315,24 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, |
| 65 | + tkz->last = end; |
| 66 | + |
| 67 | + while (data < end) { |
| 68 | +- data = tkz->state(tkz, data, end); |
| 69 | ++ const lxb_char_t *new_data = tkz->state(tkz, data, end); |
| 70 | ++ while (data < new_data) { |
| 71 | + /* In UTF-8, code points < 0x80 are encoded as the identical single ASCII byte, and such bytes never occur inside a multi-byte sequence, so '\n' uniquely identifies a newline. */ |
| 72 | ++ if (*data == '\n') { |
| 73 | ++ tkz->current_line++; |
| 74 | ++ tkz->current_column = 0; |
| 75 | ++ } else { |
| 76 | + /* Other bytes can be mapped back to a Unicode code point offset because UTF-8 is a prefix code. |
| 77 | + * Continuation bytes have the form 0b10XXXXXX, so skipping them counts only the first byte of each encoded code point. */ |
| 78 | ++ if ((*data & 0b11000000) == 0b10000000) { |
| 79 | ++ /* Continuation byte, do nothing */ |
| 80 | ++ } else { |
| 81 | ++ /* First byte for a codepoint */ |
| 82 | ++ tkz->current_column++; |
| 83 | ++ } |
| 84 | ++ } |
| 85 | ++ data++; |
| 86 | ++ } |
| 87 | + } |
| 88 | + |
| 89 | + return tkz->status; |
| 90 | +diff --git a/source/lexbor/html/tokenizer.h b/source/lexbor/html/tokenizer.h |
| 91 | +index ba9602f..74bb55e 100755 |
| 92 | +--- a/source/lexbor/html/tokenizer.h |
| 93 | ++++ b/source/lexbor/html/tokenizer.h |
| 94 | +@@ -73,6 +73,8 @@ struct lxb_html_tokenizer { |
| 95 | + const lxb_char_t *end; |
| 96 | + const lxb_char_t *begin; |
| 97 | + const lxb_char_t *last; |
| 98 | ++ size_t current_line; |
| 99 | ++ size_t current_column; |
| 100 | + |
| 101 | + /* Entities */ |
| 102 | + const lexbor_sbst_entry_static_t *entity; |
| 103 | +diff --git a/source/lexbor/html/tokenizer/state.h b/source/lexbor/html/tokenizer/state.h |
| 104 | +index 0892846..77b86ac 100755 |
| 105 | +--- a/source/lexbor/html/tokenizer/state.h |
| 106 | ++++ b/source/lexbor/html/tokenizer/state.h |
| 107 | +@@ -90,6 +90,8 @@ extern "C" { |
| 108 | + do { \ |
| 109 | + tkz->pos = tkz->start; \ |
| 110 | + tkz->token->begin = v_begin; \ |
| 111 | ++ tkz->token->line = tkz->current_line; \ |
| 112 | ++ tkz->token->column = tkz->current_column; \ |
| 113 | + } \ |
| 114 | + while (0) |
| 115 | + |
| 116 | +diff --git a/source/lexbor/html/tree.c b/source/lexbor/html/tree.c |
| 117 | +index 0f067e4..bdec6a5 100755 |
| 118 | +--- a/source/lexbor/html/tree.c |
| 119 | ++++ b/source/lexbor/html/tree.c |
| 120 | +@@ -434,6 +434,9 @@ lxb_html_tree_create_element_for_token(lxb_html_tree_t *tree, |
| 121 | + return NULL; |
| 122 | + } |
| 123 | + |
| 124 | ++ node->line = token->line; |
| 125 | + /* Only the line number is exposed in PHP DOM, so the column is not copied to the node. */ |
| 126 | ++ |
| 127 | + lxb_status_t status; |
| 128 | + lxb_dom_element_t *element = lxb_dom_interface_element(node); |
| 129 | + |
| 130 | +@@ -770,6 +773,11 @@ lxb_html_tree_insert_character_for_data(lxb_html_tree_t *tree, |
| 131 | + |
| 132 | + lxb_dom_interface_text(text)->char_data.data = *str; |
| 133 | + |
| 134 | ++ if (tree->tkz_ref) { |
| 135 | ++ text->line = tree->tkz_ref->token->line; |
| 136 | + /* Only the line number is exposed in PHP DOM, so the column is not copied to the node. */ |
| 137 | ++ } |
| 138 | ++ |
| 139 | + if (ret_node != NULL) { |
| 140 | + *ret_node = text; |
| 141 | + } |
| 142 | +@@ -809,6 +817,9 @@ lxb_html_tree_insert_comment(lxb_html_tree_t *tree, |
| 143 | + return NULL; |
| 144 | + } |
| 145 | + |
| 146 | ++ node->line = token->line; |
| 147 | + /* Only the line number is exposed in PHP DOM, so the column is not copied to the node. */ |
| 148 | ++ |
| 149 | + tree->status = lxb_html_token_make_text(token, &comment->char_data.data, |
| 150 | + tree->document->dom_document.text); |
| 151 | + if (tree->status != LXB_STATUS_OK) { |
| 152 | +diff --git a/source/lexbor/html/tree/error.c b/source/lexbor/html/tree/error.c |
| 153 | +index e6e43f4..88ad8c4 100755 |
| 154 | +--- a/source/lexbor/html/tree/error.c |
| 155 | ++++ b/source/lexbor/html/tree/error.c |
| 156 | +@@ -21,8 +21,9 @@ lxb_html_tree_error_add(lexbor_array_obj_t *parse_errors, |
| 157 | + } |
| 158 | + |
| 159 | + entry->id = id; |
| 160 | +- entry->begin = token->begin; |
| 161 | +- entry->end = token->end; |
| 162 | ++ entry->line = token->line; |
| 163 | ++ entry->column = token->column; |
| 164 | ++ entry->length = token->end - token->begin; |
| 165 | + |
| 166 | + return entry; |
| 167 | + } |
| 168 | +diff --git a/source/lexbor/html/tree/error.h b/source/lexbor/html/tree/error.h |
| 169 | +index 2fd06cb..ed1859f 100755 |
| 170 | +--- a/source/lexbor/html/tree/error.h |
| 171 | ++++ b/source/lexbor/html/tree/error.h |
| 172 | +@@ -97,8 +97,9 @@ lxb_html_tree_error_id_t; |
| 173 | + |
| 174 | + typedef struct { |
| 175 | + lxb_html_tree_error_id_t id; |
| 176 | +- const lxb_char_t *begin; |
| 177 | +- const lxb_char_t *end; |
| 178 | ++ size_t line; |
| 179 | ++ size_t column; |
| 180 | ++ size_t length; |
| 181 | + } |
| 182 | + lxb_html_tree_error_t; |
| 183 | + |
| 184 | +-- |
| 185 | +2.41.0 |
| 186 | + |
0 commit comments