1
- From faee2893e499bdcaa3a511bcff197366b8a87968 Mon Sep 17 00:00:00 2001
1
+ From 9d60c0fda0b51e9374a234c48df36130d2c988ee Mon Sep 17 00:00:00 2001
2
2
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
3
3
Date: Sat, 26 Aug 2023 15:08:59 +0200
4
4
Subject: [PATCH] Expose line and column information for use in PHP
5
5
6
6
---
7
7
source/lexbor/dom/interfaces/node.h | 2 ++
8
8
source/lexbor/html/token.h | 2 ++
9
- source/lexbor/html/tokenizer.c | 22 +++++++++++++++++++++-
9
+ source/lexbor/html/tokenizer.c | 24 +++++++++++++++++++++++-
10
10
source/lexbor/html/tokenizer.h | 2 ++
11
11
source/lexbor/html/tokenizer/state.h | 2 ++
12
12
source/lexbor/html/tree.c | 11 +++++++++++
13
13
source/lexbor/html/tree/error.c | 5 +++--
14
14
source/lexbor/html/tree/error.h | 5 +++--
15
- 8 files changed, 46 insertions(+), 5 deletions(-)
15
+ 8 files changed, 48 insertions(+), 5 deletions(-)
16
16
17
17
diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
18
18
index 4a10197..ff9c924 100755
@@ -41,7 +41,7 @@ index 79accd0..0b7f4fd 100755
41
41
const lxb_char_t *text_start;
42
42
const lxb_char_t *text_end;
43
43
diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
44
- index 741bced..a399758 100755
44
+ index 741bced..0bd9aec 100755
45
45
--- a/source/lexbor/html/tokenizer.c
46
46
+++ b/source/lexbor/html/tokenizer.c
47
47
@@ -91,6 +91,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
@@ -61,29 +61,31 @@ index 741bced..a399758 100755
61
61
62
62
return LXB_STATUS_OK;
63
63
}
64
- @@ -312,7 +315,24 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
64
+ @@ -312,7 +315,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
65
65
tkz->last = end;
66
66
67
67
while (data < end) {
68
68
- data = tkz->state(tkz, data, end);
69
+ + size_t current_column = tkz->current_column;
69
70
+ const lxb_char_t *new_data = tkz->state(tkz, data, end);
70
71
+ while (data < new_data) {
71
72
+ /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
72
73
+ if (*data == '\n') {
73
74
+ tkz->current_line++;
74
- + tkz->current_column = 0;
75
+ + current_column = 0;
75
76
+ } else {
76
77
+ /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
77
78
+ * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
78
79
+ if ((*data & 0b11000000) == 0b10000000) {
79
80
+ /* Continuation byte, do nothing */
80
81
+ } else {
81
82
+ /* First byte for a codepoint */
82
- + tkz->current_column++;
83
+ + current_column++;
83
84
+ }
84
85
+ }
85
86
+ data++;
86
87
+ }
88
+ + tkz->current_column = current_column;
87
89
}
88
90
89
91
return tkz->status;
@@ -182,5 +184,5 @@ index 2fd06cb..ed1859f 100755
182
184
lxb_html_tree_error_t;
183
185
184
186
- -
185
- 2.41.0
187
+ 2.43.0
186
188
0 commit comments