Skip to content

Commit a906481

Browse files
authored
Optimizations for HTML 5 loading (#12896)
* Fix inverted NULL check and add dictionary
* Avoid useless error processing if no error reporter is set
* Avoid double work while processing attributes and use fast text node instantiation
1 parent 90eb567 commit a906481

File tree

2 files changed

+81
-47
lines changed

2 files changed

+81
-47
lines changed

ext/dom/html5_parser.c

Lines changed: 77 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,26 @@ static const xmlChar *get_libxml_namespace_href(uintptr_t lexbor_namespace)
7474
}
7575
}
7676

77+
/* Creates a libxml2 text node owned by lxml_doc that holds data_length bytes of data.
 *
 * When compact_text_nodes is true and data_length is below LXML_INTERNED_STRINGS_SIZE,
 * the node is allocated and filled in by hand and the text is stored inside the node
 * itself (starting at its properties field) instead of in a separate allocation.
 * Otherwise the node is created through xmlNewDocTextLen().
 *
 * Returns the new node; NULL if xmlMalloc() fails on the compact path, or whatever
 * xmlNewDocTextLen() returns on the regular path. The node is not linked to any parent. */
static xmlNodePtr lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc, const lxb_char_t *data, size_t data_length, bool compact_text_nodes)
78+
{
79+
if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
80+
/* See xmlSAX2TextNode() in libxml2 */
81+
xmlNodePtr lxml_text = xmlMalloc(sizeof(*lxml_text));
82+
if (UNEXPECTED(lxml_text == NULL)) {
83+
return NULL;
84+
}
85+
memset(lxml_text, 0, sizeof(*lxml_text)); /* Zero every field; the inline text below relies on this */
86+
lxml_text->name = xmlStringText;
87+
lxml_text->type = XML_TEXT_NODE;
88+
lxml_text->doc = lxml_doc;
89+
lxml_text->content = (xmlChar *) &lxml_text->properties; /* Text lives inside the node, not in its own buffer */
90+
memcpy(lxml_text->content, data, data_length); /* No '\0' is copied: the bytes after the data are still zero from the memset.
 * NOTE(review): presumably LXML_INTERNED_STRINGS_SIZE guarantees data plus terminator fit in the node - defined outside this view, confirm */
91+
return lxml_text;
92+
} else {
93+
return xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length);
94+
}
95+
}
96+
7797
static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
7898
lxb_dom_node_t *start_node,
7999
xmlDocPtr lxml_doc,
@@ -130,14 +150,52 @@ static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
130150
);
131151
}
132152

133-
for (lxb_dom_attr_t *attr = element->last_attr; attr != NULL; attr = attr->prev) {
134-
lexbor_libxml2_bridge_work_list_item_push(
135-
&work_list,
136-
(lxb_dom_node_t *) attr,
137-
entering_namespace,
138-
lxml_element,
139-
current_lxml_ns
140-
);
153+
xmlAttrPtr last_added_attr = NULL;
154+
for (lxb_dom_attr_t *attr = element->first_attr; attr != NULL; attr = attr->next) {
155+
/* Same namespace remark as for elements */
156+
size_t local_name_length, value_length;
157+
const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, &local_name_length);
158+
const lxb_char_t *value = lxb_dom_attr_value(attr, &value_length);
159+
160+
if (UNEXPECTED(local_name_length >= INT_MAX || value_length >= INT_MAX)) {
161+
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
162+
goto out;
163+
}
164+
165+
xmlAttrPtr lxml_attr = xmlMalloc(sizeof(xmlAttr));
166+
if (UNEXPECTED(lxml_attr == NULL)) {
167+
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
168+
goto out;
169+
}
170+
171+
memset(lxml_attr, 0, sizeof(xmlAttr));
172+
lxml_attr->type = XML_ATTRIBUTE_NODE;
173+
lxml_attr->parent = lxml_element;
174+
lxml_attr->name = xmlDictLookup(lxml_doc->dict, local_name, local_name_length);
175+
lxml_attr->doc = lxml_doc;
176+
xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, value, value_length, true /* Always true for optimization purposes */);
177+
if (UNEXPECTED(lxml_text == NULL)) {
178+
xmlFreeProp(lxml_attr);
179+
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
180+
goto out;
181+
}
182+
183+
lxml_attr->children = lxml_attr->last = lxml_text;
184+
185+
if (last_added_attr == NULL) {
186+
lxml_element->properties = lxml_attr;
187+
} else {
188+
last_added_attr->next = lxml_attr;
189+
lxml_attr->prev = last_added_attr;
190+
}
191+
last_added_attr = lxml_attr;
192+
193+
/* xmlIsID does some other stuff too that is irrelevant here. */
194+
if (local_name_length == 2 && local_name[0] == 'i' && local_name[1] == 'd') {
195+
xmlAddID(NULL, lxml_doc, value, lxml_attr);
196+
}
197+
198+
/* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
141199
}
142200
} else if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
143201
lxb_dom_text_t *text = lxb_dom_interface_text(node);
@@ -147,26 +205,10 @@ static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
147205
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW;
148206
goto out;
149207
}
150-
xmlNodePtr lxml_text;
151-
if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
152-
/* See xmlSAX2TextNode() in libxml2 */
153-
lxml_text = xmlMalloc(sizeof(*lxml_text));
154-
if (UNEXPECTED(lxml_text == NULL)) {
155-
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
156-
goto out;
157-
}
158-
memset(lxml_text, 0, sizeof(*lxml_text));
159-
lxml_text->name = xmlStringText;
160-
lxml_text->type = XML_TEXT_NODE;
161-
lxml_text->doc = lxml_doc;
162-
lxml_text->content = (xmlChar *) &lxml_text->properties;
163-
memcpy(lxml_text->content, data, data_length + 1 /* include '\0' */);
164-
} else {
165-
lxml_text = xmlNewDocTextLen(lxml_doc, (const xmlChar *) data, data_length);
166-
if (UNEXPECTED(lxml_text == NULL)) {
167-
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
168-
goto out;
169-
}
208+
xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc, data, data_length, compact_text_nodes);
209+
if (UNEXPECTED(lxml_text == NULL)) {
210+
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
211+
goto out;
170212
}
171213
xmlAddChild(lxml_parent, lxml_text);
172214
if (node->line >= USHRT_MAX) {
@@ -192,20 +234,6 @@ static lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert(
192234
goto out;
193235
}
194236
/* libxml2 doesn't support line numbers on this anyway, it returns -1 instead, so don't bother */
195-
} else if (node->type == LXB_DOM_NODE_TYPE_ATTRIBUTE) {
196-
lxb_dom_attr_t *attr = lxb_dom_interface_attr(node);
197-
do {
198-
/* Same namespace remark as for elements */
199-
const lxb_char_t *local_name = lxb_dom_attr_local_name(attr, NULL);
200-
const lxb_char_t *value = lxb_dom_attr_value(attr, NULL);
201-
xmlAttrPtr lxml_attr = xmlSetNsProp(lxml_parent, NULL, local_name, value);
202-
if (UNEXPECTED(lxml_attr == NULL)) {
203-
retval = LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
204-
goto out;
205-
}
206-
attr = attr->next;
207-
/* libxml2 doesn't support line numbers on this anyway, it derives them instead, so don't bother */
208-
} while (attr);
209237
} else if (node->type == LXB_DOM_NODE_TYPE_COMMENT) {
210238
lxb_dom_comment_t *comment = lxb_dom_interface_comment(node);
211239
xmlNodePtr lxml_comment = xmlNewDocComment(lxml_doc, comment->char_data.data.data);
@@ -247,15 +275,19 @@ lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(
247275
{
248276
#ifdef LIBXML_HTML_ENABLED
249277
xmlDocPtr lxml_doc = htmlNewDocNoDtD(NULL, NULL);
278+
if (UNEXPECTED(!lxml_doc)) {
279+
return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
280+
}
250281
#else
251282
/* If HTML support is not enabled, then htmlNewDocNoDtD() is not available.
252283
* This code mimics the behaviour. */
253284
xmlDocPtr lxml_doc = xmlNewDoc((const xmlChar *) "1.0");
254-
lxml_doc->type = XML_HTML_DOCUMENT_NODE;
255-
#endif
256-
if (!lxml_doc) {
285+
if (UNEXPECTED(!lxml_doc)) {
257286
return LEXBOR_LIBXML2_BRIDGE_STATUS_OOM;
258287
}
288+
lxml_doc->type = XML_HTML_DOCUMENT_NODE;
289+
#endif
290+
lxml_doc->dict = xmlDictCreate();
259291
lexbor_libxml2_bridge_status status = lexbor_libxml2_bridge_convert(
260292
lxb_dom_interface_node(document)->last_child,
261293
lxml_doc,

ext/dom/html_document.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -487,8 +487,10 @@ static bool dom_process_parse_chunk(
487487
if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
488488
return false;
489489
}
490-
lexbor_libxml2_bridge_report_errors(ctx, parser, encoding_output, application_data->current_total_offset, tokenizer_error_offset, tree_error_offset);
491-
dom_find_line_and_column_using_cache(application_data, &application_data->cache_tokenizer, application_data->current_total_offset + input_buffer_length);
490+
if (ctx->tokenizer_error_reporter || ctx->tree_error_reporter) {
491+
lexbor_libxml2_bridge_report_errors(ctx, parser, encoding_output, application_data->current_total_offset, tokenizer_error_offset, tree_error_offset);
492+
dom_find_line_and_column_using_cache(application_data, &application_data->cache_tokenizer, application_data->current_total_offset + input_buffer_length);
493+
}
492494
application_data->current_total_offset += input_buffer_length;
493495
application_data->cache_tokenizer.last_offset = 0;
494496
return true;

0 commit comments

Comments
 (0)