Skip to content

Commit 0ff0ee3

Browse files
committed
Implement iteration cache, item cache and length cache for node list iteration
The current implementation follows the spec requirement that the list must be "live". This means that changes in the document must be reflected in the existing node lists without requiring the user to refetch the node list. The consequence is that getting any item, or the length of the list, always starts searching from the root element of the node list. This results in O(n) time to get any item or the length. If there's a for loop over the node list, this means the iterations will take O(n²) time in total. This causes real-world performance issues with potential for downtime (see GH-11308 and its references for details). We fix this by introducing a caching strategy. We cache the last iterated object in the iterator, the last requested item in the node list, and the last length computation. To invalidate the cache, we simply count the number of modifications made to the containing document. If the modification number does not match what the number was during caching, we know the document has been modified and the cache is invalid. If this ever overflows, we saturate the modification number and don't do any caching anymore. Note that we don't check for overflow on 64-bit systems because it would take hundreds of years to overflow. Fixes GH-11308.
1 parent 91fd564 commit 0ff0ee3

15 files changed

+394
-21
lines changed

UPGRADING.INTERNALS

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,12 @@ PHP 8.3 INTERNALS UPGRADE NOTES
120120
- A new function dom_get_doc_props_read_only() is added to gather the document
121121
properties in a read-only way. This function avoids allocation when there are
122122
no document properties changed yet.
123+
- The node list returned by DOMNode::getElementsByTagName() and
124+
DOMNode::getElementsByTagNameNS() now caches the length and the last requested item.
125+
This means that the length and the last requested item are not recalculated
126+
when the node list is iterated over multiple times.
127+
If you do not use the internal PHP dom APIs to modify the document, you need to
128+
manually invalidate the cache using php_dom_invalidate_node_list_cache().
123129

124130
========================
125131
4. OpCode changes

ext/dom/document.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -847,6 +847,8 @@ PHP_METHOD(DOMDocument, importNode)
847847
}
848848
}
849849

850+
php_dom_invalidate_node_list_cache(docp);
851+
850852
DOM_RET_OBJ((xmlNodePtr) retnodep, &ret, intern);
851853
}
852854
/* }}} end dom_document_import_node */
@@ -1070,6 +1072,8 @@ PHP_METHOD(DOMDocument, normalizeDocument)
10701072

10711073
DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
10721074

1075+
php_dom_invalidate_node_list_cache(docp);
1076+
10731077
dom_normalize((xmlNodePtr) docp);
10741078
}
10751079
/* }}} end dom_document_normalize_document */
@@ -1328,10 +1332,14 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
13281332

13291333
if (id != NULL) {
13301334
intern = Z_DOMOBJ_P(id);
1335+
size_t old_modification_nr = 0;
13311336
if (intern != NULL) {
13321337
docp = (xmlDocPtr) dom_object_get_node(intern);
13331338
doc_prop = NULL;
13341339
if (docp != NULL) {
1340+
const php_libxml_doc_ptr *doc_ptr = docp->_private;
1341+
ZEND_ASSERT(doc_ptr != NULL); /* Must exist, we have a document */
1342+
old_modification_nr = doc_ptr->cache_tag.modification_nr;
13351343
php_libxml_decrement_node_ptr((php_libxml_node_object *) intern);
13361344
doc_prop = intern->document->doc_props;
13371345
intern->document->doc_props = NULL;
@@ -1348,6 +1356,12 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
13481356
}
13491357

13501358
php_libxml_increment_node_ptr((php_libxml_node_object *)intern, (xmlNodePtr)newdoc, (void *)intern);
1359+
/* Since iterators should invalidate, we need to start the modification number from the old counter */
1360+
if (old_modification_nr != 0) {
1361+
php_libxml_doc_ptr* doc_ptr = (php_libxml_doc_ptr*) ((php_libxml_node_object*) intern)->node; /* downcast */
1362+
doc_ptr->cache_tag.modification_nr = old_modification_nr;
1363+
php_libxml_invalidate_node_list_cache(doc_ptr);
1364+
}
13511365

13521366
RETURN_TRUE;
13531367
} else {
@@ -1563,6 +1577,8 @@ PHP_METHOD(DOMDocument, xinclude)
15631577
php_dom_remove_xinclude_nodes(root);
15641578
}
15651579

1580+
php_dom_invalidate_node_list_cache(docp);
1581+
15661582
if (err) {
15671583
RETVAL_LONG(err);
15681584
} else {
@@ -1871,10 +1887,14 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */
18711887

18721888
if (id != NULL && instanceof_function(Z_OBJCE_P(id), dom_document_class_entry)) {
18731889
intern = Z_DOMOBJ_P(id);
1890+
size_t old_modification_nr = 0;
18741891
if (intern != NULL) {
18751892
docp = (xmlDocPtr) dom_object_get_node(intern);
18761893
doc_prop = NULL;
18771894
if (docp != NULL) {
1895+
const php_libxml_doc_ptr *doc_ptr = docp->_private;
1896+
ZEND_ASSERT(doc_ptr != NULL); /* Must exist, we have a document */
1897+
old_modification_nr = doc_ptr->cache_tag.modification_nr;
18781898
php_libxml_decrement_node_ptr((php_libxml_node_object *) intern);
18791899
doc_prop = intern->document->doc_props;
18801900
intern->document->doc_props = NULL;
@@ -1891,6 +1911,12 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */
18911911
}
18921912

18931913
php_libxml_increment_node_ptr((php_libxml_node_object *)intern, (xmlNodePtr)newdoc, (void *)intern);
1914+
/* Since iterators should invalidate, we need to start the modification number from the old counter */
1915+
if (old_modification_nr != 0) {
1916+
php_libxml_doc_ptr* doc_ptr = (php_libxml_doc_ptr*) ((php_libxml_node_object*) intern)->node; /* downcast */
1917+
doc_ptr->cache_tag.modification_nr = old_modification_nr;
1918+
php_libxml_invalidate_node_list_cache(doc_ptr);
1919+
}
18941920

18951921
RETURN_TRUE;
18961922
} else {

ext/dom/dom_iterators.c

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ static void php_dom_iterator_move_forward(zend_object_iterator *iter) /* {{{ */
179179
dom_object *intern;
180180
dom_object *nnmap;
181181
dom_nnodemap_object *objmap;
182-
int previndex=0;
182+
int previndex;
183183
HashTable *nodeht;
184184
zval *entry;
185185
bool do_curobj_undef = 1;
@@ -205,20 +205,27 @@ static void php_dom_iterator_move_forward(zend_object_iterator *iter) /* {{{ */
205205
do_curobj_undef = 0;
206206
}
207207
} else {
208-
curnode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
209208
if (objmap->nodetype == XML_ATTRIBUTE_NODE ||
210209
objmap->nodetype == XML_ELEMENT_NODE) {
210+
curnode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
211211
curnode = curnode->next;
212212
} else {
213213
/* Nav the tree evey time as this is LIVE */
214214
basenode = dom_object_get_node(objmap->baseobj);
215-
if (basenode && (basenode->type == XML_DOCUMENT_NODE ||
216-
basenode->type == XML_HTML_DOCUMENT_NODE)) {
217-
basenode = xmlDocGetRootElement((xmlDoc *) basenode);
218-
} else if (basenode) {
219-
basenode = basenode->children;
215+
if (php_dom_is_cache_tag_stale_from_node(&iterator->cache_tag, basenode)) {
216+
if (basenode && (basenode->type == XML_DOCUMENT_NODE ||
217+
basenode->type == XML_HTML_DOCUMENT_NODE)) {
218+
basenode = xmlDocGetRootElement((xmlDoc *) basenode);
219+
} else if (basenode) {
220+
basenode = basenode->children;
221+
} else {
222+
goto err;
223+
}
224+
php_dom_mark_cache_tag_up_to_date_from_node(&iterator->cache_tag, basenode);
225+
previndex = 0;
220226
} else {
221-
goto err;
227+
previndex = iter->index - 1;
228+
basenode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
222229
}
223230
curnode = dom_get_elements_by_tag_name_ns_raw(
224231
basenode, (char *) objmap->ns, (char *) objmap->local, &previndex, iter->index);
@@ -270,6 +277,7 @@ zend_object_iterator *php_dom_get_iterator(zend_class_entry *ce, zval *object, i
270277
}
271278
iterator = emalloc(sizeof(php_dom_iterator));
272279
zend_iterator_init(&iterator->intern);
280+
iterator->cache_tag.modification_nr = 0;
273281

274282
ZVAL_OBJ_COPY(&iterator->intern.data, Z_OBJ_P(object));
275283
iterator->intern.funcs = &php_dom_iterator_funcs;

ext/dom/node.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -769,6 +769,8 @@ int dom_node_text_content_write(dom_object *obj, zval *newval)
769769
return FAILURE;
770770
}
771771

772+
php_dom_invalidate_node_list_cache(nodep->doc);
773+
772774
const xmlChar *xmlChars = (const xmlChar *) ZSTR_VAL(str);
773775
int type = nodep->type;
774776

@@ -897,6 +899,8 @@ PHP_METHOD(DOMNode, insertBefore)
897899
php_libxml_increment_doc_ref((php_libxml_node_object *)childobj, NULL);
898900
}
899901

902+
php_dom_invalidate_node_list_cache(parentp->doc);
903+
900904
if (ref != NULL) {
901905
DOM_GET_OBJ(refp, ref, xmlNodePtr, refpobj);
902906
if (refp->parent != parentp) {
@@ -1086,6 +1090,7 @@ PHP_METHOD(DOMNode, replaceChild)
10861090
nodep->doc->intSubset = (xmlDtd *) newchild;
10871091
}
10881092
}
1093+
php_dom_invalidate_node_list_cache(nodep->doc);
10891094
DOM_RET_OBJ(oldchild, &ret, intern);
10901095
}
10911096
/* }}} end dom_node_replace_child */
@@ -1127,6 +1132,7 @@ PHP_METHOD(DOMNode, removeChild)
11271132
}
11281133

11291134
xmlUnlinkNode(child);
1135+
php_dom_invalidate_node_list_cache(nodep->doc);
11301136
DOM_RET_OBJ(child, &ret, intern);
11311137
}
11321138
/* }}} end dom_node_remove_child */
@@ -1230,6 +1236,8 @@ PHP_METHOD(DOMNode, appendChild)
12301236

12311237
dom_reconcile_ns(nodep->doc, new_child);
12321238

1239+
php_dom_invalidate_node_list_cache(nodep->doc);
1240+
12331241
DOM_RET_OBJ(new_child, &ret, intern);
12341242
}
12351243
/* }}} end dom_node_append_child */
@@ -1339,6 +1347,8 @@ PHP_METHOD(DOMNode, normalize)
13391347

13401348
DOM_GET_OBJ(nodep, id, xmlNodePtr, intern);
13411349

1350+
php_dom_invalidate_node_list_cache(nodep->doc);
1351+
13421352
dom_normalize(nodep);
13431353

13441354
}
@@ -1571,6 +1581,8 @@ static void dom_canonicalization(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{
15711581
RETURN_THROWS();
15721582
}
15731583

1584+
php_dom_invalidate_node_list_cache(docp);
1585+
15741586
if (xpath_array == NULL) {
15751587
if (nodep->type != XML_DOCUMENT_NODE) {
15761588
ctxp = xmlXPathNewContext(docp);

ext/dom/nodelist.c

Lines changed: 81 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,23 @@
3131
* Since:
3232
*/
3333

34+
static zend_always_inline void objmap_cache_release_cached_obj(dom_nnodemap_object *objmap)
35+
{
36+
if (objmap->cached_obj) {
37+
if (GC_DELREF(&objmap->cached_obj->std) == 0) {
38+
zend_objects_store_del(&objmap->cached_obj->std);
39+
}
40+
objmap->cached_obj = NULL;
41+
objmap->cached_obj_index = 0;
42+
}
43+
}
44+
45+
static zend_always_inline void reset_objmap_cache(dom_nnodemap_object *objmap)
46+
{
47+
objmap_cache_release_cached_obj(objmap);
48+
objmap->cached_length = -1;
49+
}
50+
3451
static int get_nodelist_length(dom_object *obj)
3552
{
3653
dom_nnodemap_object *objmap = (dom_nnodemap_object *) obj->ptr;
@@ -52,6 +69,17 @@ static int get_nodelist_length(dom_object *obj)
5269
return 0;
5370
}
5471

72+
if (!php_dom_is_cache_tag_stale_from_node(&objmap->cache_tag, nodep)) {
73+
if (objmap->cached_length >= 0) {
74+
return objmap->cached_length;
75+
}
76+
/* Only the length is out-of-date, the cache tag is still valid.
77+
* Therefore, only overwrite the length and keep the currently cached object. */
78+
} else {
79+
php_dom_mark_cache_tag_up_to_date_from_node(&objmap->cache_tag, nodep);
80+
reset_objmap_cache(objmap);
81+
}
82+
5583
int count = 0;
5684
if (objmap->nodetype == XML_ATTRIBUTE_NODE || objmap->nodetype == XML_ELEMENT_NODE) {
5785
xmlNodePtr curnode = nodep->children;
@@ -72,6 +100,8 @@ static int get_nodelist_length(dom_object *obj)
72100
nodep, (char *) objmap->ns, (char *) objmap->local, &count, -1);
73101
}
74102

103+
objmap->cached_length = count;
104+
75105
return count;
76106
}
77107

@@ -113,11 +143,12 @@ PHP_METHOD(DOMNodeList, item)
113143
zval *id;
114144
zend_long index;
115145
int ret;
146+
bool cache_itemnode = false;
116147
dom_object *intern;
117148
xmlNodePtr itemnode = NULL;
118149

119150
dom_nnodemap_object *objmap;
120-
xmlNodePtr nodep, curnode;
151+
xmlNodePtr nodep;
121152
int count = 0;
122153

123154
id = ZEND_THIS;
@@ -147,28 +178,68 @@ PHP_METHOD(DOMNodeList, item)
147178
} else if (objmap->baseobj) {
148179
nodep = dom_object_get_node(objmap->baseobj);
149180
if (nodep) {
181+
/* For now we're only able to use cache for forward search.
182+
* TODO: in the future we could extend the logic of the node list such that backwards searches
183+
* are also possible. */
184+
bool restart = true;
185+
int relative_index = index;
186+
if (index >= objmap->cached_obj_index && objmap->cached_obj && !php_dom_is_cache_tag_stale_from_node(&objmap->cache_tag, nodep)) {
187+
nodep = dom_object_get_node(objmap->cached_obj);
188+
/* The node cannot be NULL if the cache is valid. If it is NULL, then it means we
189+
* forgot an invalidation somewhere. Take the defensive programming approach and invalidate
190+
* it here if it's NULL (except in debug mode where we would want to catch this). */
191+
if (UNEXPECTED(nodep == NULL)) {
192+
#if ZEND_DEBUG
193+
ZEND_UNREACHABLE();
194+
#endif
195+
reset_objmap_cache(objmap);
196+
} else {
197+
restart = false;
198+
relative_index -= objmap->cached_obj_index;
199+
}
200+
}
150201
if (objmap->nodetype == XML_ATTRIBUTE_NODE || objmap->nodetype == XML_ELEMENT_NODE) {
151-
curnode = nodep->children;
152-
while (count < index && curnode != NULL) {
202+
if (restart) {
203+
nodep = nodep->children;
204+
}
205+
while (count < relative_index && nodep != NULL) {
153206
count++;
154-
curnode = curnode->next;
207+
nodep = nodep->next;
155208
}
156-
itemnode = curnode;
209+
itemnode = nodep;
157210
} else {
158-
if (nodep->type == XML_DOCUMENT_NODE || nodep->type == XML_HTML_DOCUMENT_NODE) {
159-
nodep = xmlDocGetRootElement((xmlDoc *) nodep);
160-
} else {
161-
nodep = nodep->children;
211+
if (restart) {
212+
if (nodep->type == XML_DOCUMENT_NODE || nodep->type == XML_HTML_DOCUMENT_NODE) {
213+
nodep = xmlDocGetRootElement((xmlDoc*) nodep);
214+
} else {
215+
nodep = nodep->children;
216+
}
162217
}
163-
itemnode = dom_get_elements_by_tag_name_ns_raw(nodep, (char *) objmap->ns, (char *) objmap->local, &count, index);
218+
itemnode = dom_get_elements_by_tag_name_ns_raw(nodep, (char *) objmap->ns, (char *) objmap->local, &count, relative_index);
164219
}
220+
cache_itemnode = true;
165221
}
166222
}
167223
}
168224
}
169225

170226
if (itemnode) {
171227
DOM_RET_OBJ(itemnode, &ret, objmap->baseobj);
228+
if (cache_itemnode) {
229+
/* Hold additional reference for the cache, must happen before releasing the cache
230+
* because we might be the last reference holder. */
231+
dom_object *cached_obj = Z_DOMOBJ_P(return_value);
232+
GC_ADDREF(&cached_obj->std);
233+
/* If the tag is stale, all cached data is useless. Otherwise only the cached object is useless. */
234+
if (php_dom_is_cache_tag_stale_from_node(&objmap->cache_tag, nodep)) {
235+
php_dom_mark_cache_tag_up_to_date_from_node(&objmap->cache_tag, nodep);
236+
reset_objmap_cache(objmap);
237+
} else {
238+
objmap_cache_release_cached_obj(objmap);
239+
}
240+
objmap->cached_obj_index = index;
241+
objmap->cached_obj = cached_obj;
242+
}
172243
return;
173244
}
174245
}

ext/dom/parentnode.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,8 @@ void dom_parent_node_append(dom_object *context, zval *nodes, int nodesc)
280280
return;
281281
}
282282

283+
php_dom_invalidate_node_list_cache(parentNode->doc);
284+
283285
xmlNode *fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
284286

285287
if (fragment == NULL) {
@@ -322,6 +324,8 @@ void dom_parent_node_prepend(dom_object *context, zval *nodes, int nodesc)
322324
return;
323325
}
324326

327+
php_dom_invalidate_node_list_cache(parentNode->doc);
328+
325329
xmlNodePtr newchild, nextsib;
326330
xmlNode *fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
327331

@@ -402,6 +406,8 @@ void dom_parent_node_after(dom_object *context, zval *nodes, int nodesc)
402406

403407
doc = prevsib->doc;
404408

409+
php_dom_invalidate_node_list_cache(doc);
410+
405411
/* Spec step 4: convert nodes into fragment */
406412
fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
407413

@@ -451,6 +457,8 @@ void dom_parent_node_before(dom_object *context, zval *nodes, int nodesc)
451457

452458
doc = nextsib->doc;
453459

460+
php_dom_invalidate_node_list_cache(doc);
461+
454462
/* Spec step 4: convert nodes into fragment */
455463
fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
456464

@@ -506,6 +514,8 @@ void dom_child_node_remove(dom_object *context)
506514
return;
507515
}
508516

517+
php_dom_invalidate_node_list_cache(context->document->ptr);
518+
509519
while (children) {
510520
if (children == child) {
511521
xmlUnlinkNode(child);

0 commit comments

Comments
 (0)