Skip to content

Commit aed6528

Browse files
committed
Use a local variable for the column counter so that it can be kept in a register and written back to memory only once after each scan loop
1 parent 2f1fe32 commit aed6528

File tree

2 files changed

+14
-10
lines changed

2 files changed

+14
-10
lines changed

ext/dom/lexbor/lexbor/html/tokenizer.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,24 +315,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
315315
tkz->last = end;
316316

317317
while (data < end) {
318+
size_t current_column = tkz->current_column;
318319
const lxb_char_t *new_data = tkz->state(tkz, data, end);
319320
while (data < new_data) {
320321
/* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
321322
if (*data == '\n') {
322323
tkz->current_line++;
323-
tkz->current_column = 0;
324+
current_column = 0;
324325
} else {
325326
/* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
326327
* Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
327328
if ((*data & 0b11000000) == 0b10000000) {
328329
/* Continuation byte, do nothing */
329330
} else {
330331
/* First byte for a codepoint */
331-
tkz->current_column++;
332+
current_column++;
332333
}
333334
}
334335
data++;
335336
}
337+
tkz->current_column = current_column;
336338
}
337339

338340
return tkz->status;

ext/dom/lexbor/patches/0001-Expose-line-and-column-information-for-use-in-PHP.patch

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
1-
From faee2893e499bdcaa3a511bcff197366b8a87968 Mon Sep 17 00:00:00 2001
1+
From 9d60c0fda0b51e9374a234c48df36130d2c988ee Mon Sep 17 00:00:00 2001
22
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
33
Date: Sat, 26 Aug 2023 15:08:59 +0200
44
Subject: [PATCH] Expose line and column information for use in PHP
55

66
---
77
source/lexbor/dom/interfaces/node.h | 2 ++
88
source/lexbor/html/token.h | 2 ++
9-
source/lexbor/html/tokenizer.c | 22 +++++++++++++++++++++-
9+
source/lexbor/html/tokenizer.c | 24 +++++++++++++++++++++++-
1010
source/lexbor/html/tokenizer.h | 2 ++
1111
source/lexbor/html/tokenizer/state.h | 2 ++
1212
source/lexbor/html/tree.c | 11 +++++++++++
1313
source/lexbor/html/tree/error.c | 5 +++--
1414
source/lexbor/html/tree/error.h | 5 +++--
15-
8 files changed, 46 insertions(+), 5 deletions(-)
15+
8 files changed, 48 insertions(+), 5 deletions(-)
1616

1717
diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
1818
index 4a10197..ff9c924 100755
@@ -41,7 +41,7 @@ index 79accd0..0b7f4fd 100755
4141
const lxb_char_t *text_start;
4242
const lxb_char_t *text_end;
4343
diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
44-
index 741bced..a399758 100755
44+
index 741bced..0bd9aec 100755
4545
--- a/source/lexbor/html/tokenizer.c
4646
+++ b/source/lexbor/html/tokenizer.c
4747
@@ -91,6 +91,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
@@ -61,29 +61,31 @@ index 741bced..a399758 100755
6161

6262
return LXB_STATUS_OK;
6363
}
64-
@@ -312,7 +315,24 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
64+
@@ -312,7 +315,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
6565
tkz->last = end;
6666

6767
while (data < end) {
6868
- data = tkz->state(tkz, data, end);
69+
+ size_t current_column = tkz->current_column;
6970
+ const lxb_char_t *new_data = tkz->state(tkz, data, end);
7071
+ while (data < new_data) {
7172
+ /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
7273
+ if (*data == '\n') {
7374
+ tkz->current_line++;
74-
+ tkz->current_column = 0;
75+
+ current_column = 0;
7576
+ } else {
7677
+ /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
7778
+ * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
7879
+ if ((*data & 0b11000000) == 0b10000000) {
7980
+ /* Continuation byte, do nothing */
8081
+ } else {
8182
+ /* First byte for a codepoint */
82-
+ tkz->current_column++;
83+
+ current_column++;
8384
+ }
8485
+ }
8586
+ data++;
8687
+ }
88+
+ tkz->current_column = current_column;
8789
}
8890

8991
return tkz->status;
@@ -182,5 +184,5 @@ index 2fd06cb..ed1859f 100755
182184
lxb_html_tree_error_t;
183185

184186
--
185-
2.41.0
187+
2.43.0
186188

0 commit comments

Comments
 (0)