Refactor abbr Extension

waylan · web-flow · commit ec8c305fb14e · 2024-04-25T09:46:45.000-04:00
A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated `AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists, avoiding a conflict between the two extensions. Fixes #1460. The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which better reflects what it is. `AbbrPreprocessor` has been deprecated. A call to `Markdown.reset()` now clears all previously defined abbreviations.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -10,6 +10,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [unreleased]
 
+### Changed
+
+#### Refactor `abbr` Extension
+
+A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated
+`AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists,
+avoiding a conflict between the two extensions (#1460).
+
+The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which
+better reflects what it is. `AbbrPreprocessor` has been deprecated.
+
+A call to `Markdown.reset()` now clears all previously defined abbreviations.
+
 ### Fixed
 
 * Fixed links to source code on GitHub from the documentation (#1453).
diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py
@@ -25,41 +25,102 @@
 from . import Extension
 from ..blockprocessors import BlockProcessor
 from ..inlinepatterns import InlineProcessor
-from ..util import AtomicString
+from ..treeprocessors import Treeprocessor
+from ..util import AtomicString, deprecated
+from typing import TYPE_CHECKING
 import re
 import xml.etree.ElementTree as etree
 
+if TYPE_CHECKING:  # pragma: no cover
+    from .. import Markdown
+    from ..blockparsers import BlockParser
+
 
 class AbbrExtension(Extension):
     """ Abbreviation Extension for Python-Markdown. """
 
-    def extendMarkdown(self, md):
-        """ Insert `AbbrPreprocessor` before `ReferencePreprocessor`. """
-        md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16)
+    def __init__(self, **kwargs):
+        """ Initiate Extension and set up configs. """
+        super().__init__(**kwargs)
+        self.abbrs = {}
 
+    def reset(self):
+        """ Clear all previously defined abbreviations. """
+        self.abbrs.clear()
 
-class AbbrPreprocessor(BlockProcessor):
-    """ Abbreviation Preprocessor - parse text for abbr references. """
+    def extendMarkdown(self, md):
+        """ Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """
+        md.registerExtension(self)
+        md.treeprocessors.register(AbbrTreeprocessor(md, self.abbrs), 'abbr', 7)
+        md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, self.abbrs), 'abbr', 16)
+
+
+class AbbrTreeprocessor(Treeprocessor):
+    """ Replace abbreviation text with `<abbr>` elements. """
+
+    def __init__(self, md: Markdown | None = None, abbrs: dict | None = None):
+        self.abbrs: dict = abbrs if abbrs is not None else {}
+        self.RE: re.Pattern[str] | None = None
+        super().__init__(md)
+
+    def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -> None:
+        ''' Recursively iterate over elements, run regex on text and tail, and wrap matches in `abbr` tags. '''
+        for child in reversed(el):
+            self.iter_element(child, el)
+        if text := el.text:
+            for m in reversed(list(self.RE.finditer(text))):
+                abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
+                abbr.text = AtomicString(m.group(0))
+                abbr.tail = text[m.end():]
+                el.insert(0, abbr)
+                text = text[:m.start()]
+            el.text = text
+        if parent and el.tail:
+            tail = el.tail
+            index = list(parent).index(el) + 1
+            for m in reversed(list(self.RE.finditer(tail))):
+                abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
+                abbr.text = AtomicString(m.group(0))
+                abbr.tail = tail[m.end():]
+                parent.insert(index, abbr)
+                tail = tail[:m.start()]
+            el.tail = tail
+
+    def run(self, root: etree.Element) -> etree.Element | None:
+        ''' Step through tree to find known abbreviations. '''
+        if not self.abbrs:
+            # No abbreviations defined. Skip running processor.
+            return
+        # Build and compile regex
+        self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs) })\\b")
+        # Step through tree and modify on matches
+        self.iter_element(root)
+
+
+class AbbrBlockprocessor(BlockProcessor):
+    """ Parse text for abbreviation references. """
 
     RE = re.compile(r'^[*]\[(?P<abbr>[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE)
 
+    def __init__(self, parser: BlockParser, abbrs: dict):
+        self.abbrs: dict = abbrs
+        super().__init__(parser)
+
     def test(self, parent: etree.Element, block: str) -> bool:
         return True
 
     def run(self, parent: etree.Element, blocks: list[str]) -> bool:
         """
-        Find and remove all Abbreviation references from the text.
-        Each reference is set as a new `AbbrPattern` in the markdown instance.
+        Find and remove all abbreviation references from the text.
+        Each reference is added to the abbreviation collection.
 
         """
         block = blocks.pop(0)
         m = self.RE.search(block)
         if m:
             abbr = m.group('abbr').strip()
             title = m.group('title').strip()
-            self.parser.md.inlinePatterns.register(
-                AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
-            )
+            self.abbrs[abbr] = title
             if block[m.end():].strip():
                 # Add any content after match back to blocks as separate block
                 blocks.insert(0, block[m.end():].lstrip('\n'))
@@ -71,11 +132,11 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
         blocks.insert(0, block)
         return False
 
-    def _generate_pattern(self, text: str) -> str:
-        """ Given a string, returns a regex pattern to match that string. """
-        return f"(?P<abbr>\\b{ re.escape(text) }\\b)"
+
+AbbrPreprocessor = deprecated("This class has been renamed to `AbbrBlockprocessor`.")(AbbrBlockprocessor)
 
 
+@deprecated("This class will be removed in the future; use `AbbrTreeprocessor` instead.")
 class AbbrInlineProcessor(InlineProcessor):
     """ Abbreviation inline pattern. """
 
diff --git a/tests/test_syntax/extensions/test_abbr.py b/tests/test_syntax/extensions/test_abbr.py
@@ -21,6 +21,8 @@
 """
 
 from markdown.test_tools import TestCase
+from markdown import Markdown
+from markdown.extensions.abbr import AbbrExtension
 
 
 class TestAbbr(TestCase):
@@ -60,7 +62,7 @@ def test_abbr_lower(self):
             )
         )
 
-    def test_abbr_multiple(self):
+    def test_abbr_multiple_in_text(self):
         self.assertMarkdownRenders(
             self.dedent(
                 """
@@ -79,6 +81,44 @@ def test_abbr_multiple(self):
             )
         )
 
+    def test_abbr_multiple_in_tail(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *The* HTML specification
+                is maintained by the W3C.
+
+                *[HTML]: Hyper Text Markup Language
+                *[W3C]:  World Wide Web Consortium
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>The</em> <abbr title="Hyper Text Markup Language">HTML</abbr> specification
+                is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p>
+                """
+            )
+        )
+
+    def test_abbr_multiple_nested(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                The *HTML* specification
+                is maintained by the *W3C*.
+
+                *[HTML]: Hyper Text Markup Language
+                *[W3C]:  World Wide Web Consortium
+                """
+            ),
+            self.dedent(
+                """
+                <p>The <em><abbr title="Hyper Text Markup Language">HTML</abbr></em> specification
+                is maintained by the <em><abbr title="World Wide Web Consortium">W3C</abbr></em>.</p>
+                """
+            )
+        )
+
     def test_abbr_override(self):
         self.assertMarkdownRenders(
             self.dedent(
@@ -325,3 +365,32 @@ def test_abbr_bracket(self):
                 """
             )
         )
+
+    def test_abbr_with_attr_list(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *[abbr]: Abbreviation Definition
+
+                ![Image with abbr in title](abbr.png){title="Image with abbr in title"}
+                """
+            ),
+            self.dedent(
+                """
+                <p><img alt="Image with abbr in title" src="abbr.png" title="Image with abbr in title" /></p>
+                """
+            ),
+            extensions=['abbr', 'attr_list']
+        )
+
+    def test_abbr_reset(self):
+        ext = AbbrExtension()
+        md = Markdown(extensions=[ext])
+        md.convert('*[abbr]: Abbreviation Definition')
+        self.assertEqual(ext.abbrs, {'abbr': 'Abbreviation Definition'})
+        md.convert('*[ABBR]: Capitalised Abbreviation')
+        self.assertEqual(ext.abbrs, {'abbr': 'Abbreviation Definition', 'ABBR': 'Capitalised Abbreviation'})
+        md.reset()
+        self.assertEqual(ext.abbrs, {})
+        md.convert('*[foo]: Foo Definition')
+        self.assertEqual(ext.abbrs, {'foo': 'Foo Definition'})