Optimize raw HTML post-processor (#1510)

pawamoy · web-flow · commit 1caf02892487 · 2025-03-27T08:44:01.000-04:00
Don't precompute placeholder replacements in the raw HTML post-processor. Fixes #1507. Previously, the raw HTML post-processor would precompute every possible replacement for the placeholders in a string, based on the HTML stash. It would then apply a regular expression substitution using these replacements. Finally, if the text had changed, it would recurse and do all of that again. This was inefficient because the replacements were recomputed on each recursion, and because only a few of them would be used anyway. This change moves the recursion into the regular expression substitution, so that: 1. the regular expression does minimal work on the text (instead of re-scanning text already scanned in previous frames); 2. more importantly, replacements are no longer computed ahead of time (let alone *several times*), and are only fetched from the HTML stash as placeholders are found in the text. The substitution function relies on the ordering of the regular expression's groups: we make sure to match `<p>PLACEHOLDER</p>` first, before `PLACEHOLDER`. The presence of a wrapping `p` tag determines whether the substitution result is wrapped in a `p` tag again (which also depends on whether the substituted HTML is a block-level tag: block-level HTML is never re-wrapped).
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * DRY fix in `abbr` extension by introducing method `create_element` (#1483).
 * Clean up test directory some removing some redundant tests and port
   non-redundant cases to the newer test framework.
+* Improved performance of the raw HTML post-processor (#1510).
 
 ### Fixed
 
diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py
@@ -28,7 +28,6 @@
 
 from __future__ import annotations
 
-from collections import OrderedDict
 from typing import TYPE_CHECKING, Any
 from . import util
 import re
@@ -73,37 +72,26 @@ class RawHtmlPostprocessor(Postprocessor):
 
     def run(self, text: str) -> str:
         """ Iterate over html stash and restore html. """
-        replacements = OrderedDict()
-        for i in range(self.md.htmlStash.html_counter):
-            html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[i])
-            if self.isblocklevel(html):
-                replacements["<p>{}</p>".format(
-                    self.md.htmlStash.get_placeholder(i))] = html
-            replacements[self.md.htmlStash.get_placeholder(i)] = html
-
         def substitute_match(m: re.Match[str]) -> str:
-            key = m.group(0)
-
-            if key not in replacements:
-                if key[3:-4] in replacements:
-                    return f'<p>{ replacements[key[3:-4]] }</p>'
-                else:
-                    return key
-
-            return replacements[key]
-
-        if replacements:
+            if key := m.group(1):
+                wrapped = True
+            else:
+                key = m.group(2)
+                wrapped = False
+            if (key := int(key)) >= self.md.htmlStash.html_counter:
+                return m.group(0)
+            html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[key])
+            if not wrapped or self.isblocklevel(html):
+                return pattern.sub(substitute_match, html)
+            return pattern.sub(substitute_match, f"<p>{html}</p>")
+
+        if self.md.htmlStash.html_counter:
             base_placeholder = util.HTML_PLACEHOLDER % r'([0-9]+)'
             pattern = re.compile(f'<p>{ base_placeholder }</p>|{ base_placeholder }')
-            processed_text = pattern.sub(substitute_match, text)
+            return pattern.sub(substitute_match, text)
         else:
             return text
 
-        if processed_text == text:
-            return processed_text
-        else:
-            return self.run(processed_text)
-
     def isblocklevel(self, html: str) -> bool:
         """ Check is block of HTML is block-level. """
         m = self.BLOCK_LEVEL_REGEX.match(html)