Skip to content

Commit 767fa98

Browse files
committed
Add (already-applied) Lexbor patches
1 parent 78ec17e commit 767fa98

5 files changed

+735
-0
lines changed
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
From faee2893e499bdcaa3a511bcff197366b8a87968 Mon Sep 17 00:00:00 2001
2+
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
3+
Date: Sat, 26 Aug 2023 15:08:59 +0200
4+
Subject: [PATCH] Expose line and column information for use in PHP
5+
6+
---
7+
source/lexbor/dom/interfaces/node.h | 2 ++
8+
source/lexbor/html/token.h | 2 ++
9+
source/lexbor/html/tokenizer.c | 22 +++++++++++++++++++++-
10+
source/lexbor/html/tokenizer.h | 2 ++
11+
source/lexbor/html/tokenizer/state.h | 2 ++
12+
source/lexbor/html/tree.c | 11 +++++++++++
13+
source/lexbor/html/tree/error.c | 5 +++--
14+
source/lexbor/html/tree/error.h | 5 +++--
15+
8 files changed, 46 insertions(+), 5 deletions(-)
16+
17+
diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
18+
index 4a10197..ff9c924 100755
19+
--- a/source/lexbor/dom/interfaces/node.h
20+
+++ b/source/lexbor/dom/interfaces/node.h
21+
@@ -58,6 +58,8 @@ struct lxb_dom_node {
22+
23+
lxb_dom_node_type_t type;
24+
25+
+ size_t line;
26+
+
27+
#ifdef LXB_DOM_NODE_USER_VARIABLES
28+
LXB_DOM_NODE_USER_VARIABLES
29+
#endif /* LXB_DOM_NODE_USER_VARIABLES */
30+
diff --git a/source/lexbor/html/token.h b/source/lexbor/html/token.h
31+
index 79accd0..0b7f4fd 100755
32+
--- a/source/lexbor/html/token.h
33+
+++ b/source/lexbor/html/token.h
34+
@@ -33,6 +33,8 @@ enum lxb_html_token_type {
35+
typedef struct {
36+
const lxb_char_t *begin;
37+
const lxb_char_t *end;
38+
+ size_t line;
39+
+ size_t column;
40+
41+
const lxb_char_t *text_start;
42+
const lxb_char_t *text_end;
43+
diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
44+
index 741bced..a399758 100755
45+
--- a/source/lexbor/html/tokenizer.c
46+
+++ b/source/lexbor/html/tokenizer.c
47+
@@ -91,6 +91,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
48+
49+
tkz->pos = tkz->start;
50+
tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE;
51+
+ /* current_line & current_column already initialized by calloc (zero-based) */
52+
53+
tkz->tree = NULL;
54+
tkz->tags = NULL;
55+
@@ -152,6 +153,8 @@ lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
56+
tkz_to->start = tkz_from->start;
57+
tkz_to->end = tkz_from->end;
58+
tkz_to->pos = tkz_to->start;
59+
+ tkz_to->current_line = tkz_from->current_line;
60+
+ tkz_to->current_column = tkz_from->current_column;
61+
62+
return LXB_STATUS_OK;
63+
}
64+
@@ -312,7 +315,24 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
65+
tkz->last = end;
66+
67+
while (data < end) {
68+
- data = tkz->state(tkz, data, end);
69+
+ const lxb_char_t *new_data = tkz->state(tkz, data, end);
70+
+ while (data < new_data) {
71+
+ /* In UTF-8, code points < 0x80 are encoded as a single byte identical to ASCII, and such bytes never occur inside a multi-byte sequence, so a '\n' byte uniquely identifies a newline. */
72+
+ if (*data == '\n') {
73+
+ tkz->current_line++;
74+
+ tkz->current_column = 0;
75+
+ } else {
76+
+ /* Other characters can be mapped back to a Unicode code point offset because UTF-8 is a prefix code.
77+
+ * Continuation bytes have the form 0b10XXXXXX, so skipping them counts only the first byte of each encoded code point. */
78+
+ if ((*data & 0b11000000) == 0b10000000) {
79+
+ /* Continuation byte, do nothing */
80+
+ } else {
81+
+ /* First byte for a codepoint */
82+
+ tkz->current_column++;
83+
+ }
84+
+ }
85+
+ data++;
86+
+ }
87+
}
88+
89+
return tkz->status;
90+
diff --git a/source/lexbor/html/tokenizer.h b/source/lexbor/html/tokenizer.h
91+
index ba9602f..74bb55e 100755
92+
--- a/source/lexbor/html/tokenizer.h
93+
+++ b/source/lexbor/html/tokenizer.h
94+
@@ -73,6 +73,8 @@ struct lxb_html_tokenizer {
95+
const lxb_char_t *end;
96+
const lxb_char_t *begin;
97+
const lxb_char_t *last;
98+
+ size_t current_line;
99+
+ size_t current_column;
100+
101+
/* Entities */
102+
const lexbor_sbst_entry_static_t *entity;
103+
diff --git a/source/lexbor/html/tokenizer/state.h b/source/lexbor/html/tokenizer/state.h
104+
index 0892846..77b86ac 100755
105+
--- a/source/lexbor/html/tokenizer/state.h
106+
+++ b/source/lexbor/html/tokenizer/state.h
107+
@@ -90,6 +90,8 @@ extern "C" {
108+
do { \
109+
tkz->pos = tkz->start; \
110+
tkz->token->begin = v_begin; \
111+
+ tkz->token->line = tkz->current_line; \
112+
+ tkz->token->column = tkz->current_column; \
113+
} \
114+
while (0)
115+
116+
diff --git a/source/lexbor/html/tree.c b/source/lexbor/html/tree.c
117+
index 0f067e4..bdec6a5 100755
118+
--- a/source/lexbor/html/tree.c
119+
+++ b/source/lexbor/html/tree.c
120+
@@ -434,6 +434,9 @@ lxb_html_tree_create_element_for_token(lxb_html_tree_t *tree,
121+
return NULL;
122+
}
123+
124+
+ node->line = token->line;
125+
+ /* Only the line number is exposed in PHP DOM, so the column is not copied. */
126+
+
127+
lxb_status_t status;
128+
lxb_dom_element_t *element = lxb_dom_interface_element(node);
129+
130+
@@ -770,6 +773,11 @@ lxb_html_tree_insert_character_for_data(lxb_html_tree_t *tree,
131+
132+
lxb_dom_interface_text(text)->char_data.data = *str;
133+
134+
+ if (tree->tkz_ref) {
135+
+ text->line = tree->tkz_ref->token->line;
136+
+ /* Only the line number is exposed in PHP DOM, so the column is not copied. */
137+
+ }
138+
+
139+
if (ret_node != NULL) {
140+
*ret_node = text;
141+
}
142+
@@ -809,6 +817,9 @@ lxb_html_tree_insert_comment(lxb_html_tree_t *tree,
143+
return NULL;
144+
}
145+
146+
+ node->line = token->line;
147+
+ /* Only the line number is exposed in PHP DOM, so the column is not copied. */
148+
+
149+
tree->status = lxb_html_token_make_text(token, &comment->char_data.data,
150+
tree->document->dom_document.text);
151+
if (tree->status != LXB_STATUS_OK) {
152+
diff --git a/source/lexbor/html/tree/error.c b/source/lexbor/html/tree/error.c
153+
index e6e43f4..88ad8c4 100755
154+
--- a/source/lexbor/html/tree/error.c
155+
+++ b/source/lexbor/html/tree/error.c
156+
@@ -21,8 +21,9 @@ lxb_html_tree_error_add(lexbor_array_obj_t *parse_errors,
157+
}
158+
159+
entry->id = id;
160+
- entry->begin = token->begin;
161+
- entry->end = token->end;
162+
+ entry->line = token->line;
163+
+ entry->column = token->column;
164+
+ entry->length = token->end - token->begin;
165+
166+
return entry;
167+
}
168+
diff --git a/source/lexbor/html/tree/error.h b/source/lexbor/html/tree/error.h
169+
index 2fd06cb..ed1859f 100755
170+
--- a/source/lexbor/html/tree/error.h
171+
+++ b/source/lexbor/html/tree/error.h
172+
@@ -97,8 +97,9 @@ lxb_html_tree_error_id_t;
173+
174+
typedef struct {
175+
lxb_html_tree_error_id_t id;
176+
- const lxb_char_t *begin;
177+
- const lxb_char_t *end;
178+
+ size_t line;
179+
+ size_t column;
180+
+ size_t length;
181+
}
182+
lxb_html_tree_error_t;
183+
184+
--
185+
2.41.0
186+

0 commit comments

Comments
 (0)