Skip to content

Commit ec8c305

Browse files
authored
Refactor abbr Extension
A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated `AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists, avoiding a conflict between the two extensions. Fixes #1460. The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which better reflects what it is. `AbbrPreprocessor` has been deprecated. A call to `Markdown.reset()` now clears all previously defined abbreviations.
1 parent 993b57b commit ec8c305

File tree

3 files changed

+158
-15
lines changed

3 files changed

+158
-15
lines changed

docs/changelog.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010

1111
## [unreleased]
1212

13+
### Changed
14+
15+
#### Refactor `abbr` Extension
16+
17+
A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated
18+
`AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists,
19+
avoiding a conflict between the two extensions (#1460).
20+
21+
The `AbbrPreprocessor` class has been renamed to `AbbrBlockprocessor`, which
22+
better reflects what it is. `AbbrPreprocessor` has been deprecated.
23+
24+
A call to `Markdown.reset()` now clears all previously defined abbreviations.
25+
1326
### Fixed
1427

1528
* Fixed links to source code on GitHub from the documentation (#1453).

markdown/extensions/abbr.py

Lines changed: 75 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -25,41 +25,102 @@
2525
from . import Extension
2626
from ..blockprocessors import BlockProcessor
2727
from ..inlinepatterns import InlineProcessor
28-
from ..util import AtomicString
28+
from ..treeprocessors import Treeprocessor
29+
from ..util import AtomicString, deprecated
30+
from typing import TYPE_CHECKING
2931
import re
3032
import xml.etree.ElementTree as etree
3133

34+
# Imports needed only for type annotations; skipped at runtime.
if TYPE_CHECKING:  # pragma: no cover
    from .. import Markdown
    # `BlockParser` is defined in the `blockparser` module (singular).
    from ..blockparser import BlockParser
37+
3238

3339
class AbbrExtension(Extension):
3440
""" Abbreviation Extension for Python-Markdown. """
3541

36-
def extendMarkdown(self, md):
37-
""" Insert `AbbrPreprocessor` before `ReferencePreprocessor`. """
38-
md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16)
42+
def __init__(self, **kwargs):
    """ Set up the extension and create the empty abbreviation store.

    All keyword arguments are forwarded unchanged to the parent class.
    """
    super().__init__(**kwargs)
    # Abbreviation -> title mapping. The same dict object is handed to the
    # processors in `extendMarkdown`, so it must only be mutated in place.
    self.abbrs: dict = {}
3946

47+
def reset(self):
    """ Forget every abbreviation defined so far.

    The dict is emptied in place (not replaced) so that every object
    holding a reference to it sees the cleared state.
    """
    self.abbrs.clear()
4050

41-
class AbbrPreprocessor(BlockProcessor):
42-
""" Abbreviation Preprocessor - parse text for abbr references. """
51+
def extendMarkdown(self, md):
    """ Register this extension together with its tree and block processors.

    Both processors receive the same `self.abbrs` dict: the block processor
    collects the definitions, and the tree processor reads them.
    """
    # Register the extension itself with the `Markdown` instance.
    md.registerExtension(self)

    tree_processor = AbbrTreeprocessor(md, self.abbrs)
    md.treeprocessors.register(tree_processor, 'abbr', 7)

    block_processor = AbbrBlockprocessor(md.parser, self.abbrs)
    md.parser.blockprocessors.register(block_processor, 'abbr', 16)
56+
57+
58+
class AbbrTreeprocessor(Treeprocessor):
    """ Replace abbreviation text with `<abbr>` elements. """

    def __init__(self, md: Markdown | None = None, abbrs: dict | None = None):
        """
        Arguments:
            md: The `Markdown` instance, passed through to the parent class.
            abbrs: Mapping of abbreviation text to its title. The dict is used
                as given (not copied); a new empty dict is created when omitted.
        """
        self.abbrs: dict = abbrs if abbrs is not None else {}
        # Compiled in `run`, once the abbreviation definitions are known.
        self.RE: re.Pattern | None = None
        super().__init__(md)

    def _wrap_matches(self, text: str, target: etree.Element, index: int) -> str:
        """
        Wrap every abbreviation found in `text` in an `abbr` element.

        Matches are handled right-to-left and every new element is inserted
        into `target` at the same `index`, so the elements end up in document
        order and each one carries the text that followed it as its `tail`.

        Returns:
            The text which precedes the first match (all of `text` when
            nothing matched).
        """
        for m in reversed(list(self.RE.finditer(text))):
            abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
            # Atomic, so the abbreviation itself is not processed again.
            abbr.text = AtomicString(m.group(0))
            abbr.tail = text[m.end():]
            target.insert(index, abbr)
            text = text[:m.start()]
        return text

    def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -> None:
        """ Recursively iterate over elements, run regex on text and wrap matches in `abbr` tags. """
        # Children are visited first (in reverse) so the `abbr` elements
        # inserted below are never visited themselves.
        for child in reversed(el):
            self.iter_element(child, el)
        # `AtomicString` marks text which must not be processed any further.
        if (text := el.text) and not isinstance(text, AtomicString):
            el.text = self._wrap_matches(text, el, 0)
        # Compare against `None` explicitly: the truth value of an `Element`
        # reflects whether it has children and testing it is deprecated.
        if parent is not None and el.tail and not isinstance(el.tail, AtomicString):
            # New elements go directly after `el` within its parent.
            index = list(parent).index(el) + 1
            el.tail = self._wrap_matches(el.tail, parent, index)

    def run(self, root: etree.Element) -> etree.Element | None:
        """ Step through tree to find known abbreviations. """
        # Skip empty keys (they would match at every word boundary) and try
        # longer abbreviations first so that a shorter abbreviation cannot
        # shadow a longer one which starts with it.
        keys = sorted((key for key in self.abbrs if key), key=len, reverse=True)
        if not keys:
            # No usable abbreviations defined. Skip running processor.
            return
        # Build and compile regex
        self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in keys) })\\b")
        # Step through tree and modify on matches
        self.iter_element(root)
98+
99+
100+
class AbbrBlockprocessor(BlockProcessor):
101+
""" Parse text for abbreviation references. """
43102

44103
RE = re.compile(r'^[*]\[(?P<abbr>[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE)
45104

105+
def __init__(self, parser: BlockParser, abbrs: dict):
    """
    Arguments:
        parser: The block parser, passed through to the parent class.
        abbrs: Dict in which parsed abbreviation definitions are stored,
            keyed by abbreviation with the title as value.
    """
    # Keep a reference to the caller's dict (no copy is made).
    self.abbrs: dict = abbrs
    super().__init__(parser)
108+
46109
def test(self, parent: etree.Element, block: str) -> bool:
    """ Accept every block; `run` decides whether it holds a definition. """
    return True
48111

49112
def run(self, parent: etree.Element, blocks: list[str]) -> bool:
50113
"""
51-
Find and remove all Abbreviation references from the text.
52-
Each reference is set as a new `AbbrPattern` in the markdown instance.
114+
Find and remove all abbreviation references from the text.
115+
Each reference is added to the abbreviation collection.
53116
54117
"""
55118
block = blocks.pop(0)
56119
m = self.RE.search(block)
57120
if m:
58121
abbr = m.group('abbr').strip()
59122
title = m.group('title').strip()
60-
self.parser.md.inlinePatterns.register(
61-
AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
62-
)
123+
self.abbrs[abbr] = title
63124
if block[m.end():].strip():
64125
# Add any content after match back to blocks as separate block
65126
blocks.insert(0, block[m.end():].lstrip('\n'))
@@ -71,11 +132,11 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
71132
blocks.insert(0, block)
72133
return False
73134

74-
def _generate_pattern(self, text: str) -> str:
75-
""" Given a string, returns a regex pattern to match that string. """
76-
return f"(?P<abbr>\\b{ re.escape(text) }\\b)"
135+
136+
# Old name of `AbbrBlockprocessor`, kept so that existing references to
# `AbbrPreprocessor` keep resolving.
# NOTE(review): `deprecated` is imported from `..util`; presumably it emits a
# deprecation warning with the given message when the alias is used — confirm.
AbbrPreprocessor = deprecated("This class has been renamed to `AbbrBlockprocessor`.")(AbbrBlockprocessor)
77137

78138

139+
@deprecated("This class will be removed in the future; use `AbbrTreeprocessor` instead.")
79140
class AbbrInlineProcessor(InlineProcessor):
80141
""" Abbreviation inline pattern. """
81142

tests/test_syntax/extensions/test_abbr.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
"""
2222

2323
from markdown.test_tools import TestCase
24+
from markdown import Markdown
25+
from markdown.extensions.abbr import AbbrExtension
2426

2527

2628
class TestAbbr(TestCase):
@@ -60,7 +62,7 @@ def test_abbr_lower(self):
6062
)
6163
)
6264

63-
def test_abbr_multiple(self):
65+
def test_abbr_multiple_in_text(self):
6466
self.assertMarkdownRenders(
6567
self.dedent(
6668
"""
@@ -79,6 +81,44 @@ def test_abbr_multiple(self):
7981
)
8082
)
8183

84+
def test_abbr_multiple_in_tail(self):
    """ Abbreviations in the tail text after an inline element are wrapped. """
    source = self.dedent(
        """
        *The* HTML specification
        is maintained by the W3C.

        *[HTML]: Hyper Text Markup Language
        *[W3C]: World Wide Web Consortium
        """
    )
    expected = self.dedent(
        """
        <p><em>The</em> <abbr title="Hyper Text Markup Language">HTML</abbr> specification
        is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p>
        """
    )
    self.assertMarkdownRenders(source, expected)
102+
103+
def test_abbr_multiple_nested(self):
    """ Abbreviations which form the text of an inline element are wrapped. """
    source = self.dedent(
        """
        The *HTML* specification
        is maintained by the *W3C*.

        *[HTML]: Hyper Text Markup Language
        *[W3C]: World Wide Web Consortium
        """
    )
    expected = self.dedent(
        """
        <p>The <em><abbr title="Hyper Text Markup Language">HTML</abbr></em> specification
        is maintained by the <em><abbr title="World Wide Web Consortium">W3C</abbr></em>.</p>
        """
    )
    self.assertMarkdownRenders(source, expected)
121+
82122
def test_abbr_override(self):
83123
self.assertMarkdownRenders(
84124
self.dedent(
@@ -325,3 +365,32 @@ def test_abbr_bracket(self):
325365
"""
326366
)
327367
)
368+
369+
def test_abbr_with_attr_list(self):
    """ Attribute values set via `attr_list` are not rewritten by `abbr`. """
    source = self.dedent(
        """
        *[abbr]: Abbreviation Definition

        ![Image with abbr in title](abbr.png){title="Image with abbr in title"}
        """
    )
    expected = self.dedent(
        """
        <p><img alt="Image with abbr in title" src="abbr.png" title="Image with abbr in title" /></p>
        """
    )
    self.assertMarkdownRenders(source, expected, extensions=['abbr', 'attr_list'])
385+
386+
def test_abbr_reset(self):
    """ Definitions accumulate across conversions until `reset` is called. """
    ext = AbbrExtension()
    md = Markdown(extensions=[ext])

    # Definitions from successive conversions are collected together.
    md.convert('*[abbr]: Abbreviation Definition')
    expected = {'abbr': 'Abbreviation Definition'}
    self.assertEqual(ext.abbrs, expected)

    md.convert('*[ABBR]: Capitalised Abbreviation')
    expected = {'abbr': 'Abbreviation Definition', 'ABBR': 'Capitalised Abbreviation'}
    self.assertEqual(ext.abbrs, expected)

    # Resetting drops everything collected so far.
    md.reset()
    self.assertEqual(ext.abbrs, {})

    md.convert('*[foo]: Foo Definition')
    expected = {'foo': 'Foo Definition'}
    self.assertEqual(ext.abbrs, expected)

0 commit comments

Comments
 (0)