Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor TOC sanitation #1441

Merged
merged 18 commits into from
Mar 8, 2024
Merged
133 changes: 90 additions & 43 deletions markdown/extensions/toc.py
waylan marked this conversation as resolved.
Show resolved Hide resolved
waylan marked this conversation as resolved.
Show resolved Hide resolved
waylan marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@

from . import Extension
from ..treeprocessors import Treeprocessor
from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString
from ..util import parseBoolValue, AMP_SUBSTITUTE
from ..treeprocessors import UnescapeTreeprocessor
from ..serializers import RE_AMP
import re
import html
import unicodedata
from copy import deepcopy
from html import unescape as html_unescape
import xml.etree.ElementTree as etree
from typing import TYPE_CHECKING, Any, Iterator, MutableSet

Expand All @@ -35,6 +38,8 @@

def slugify(value: str, separator: str, unicode: bool = False) -> str:
""" Slugify a string, to make it URL friendly. """
# First convert HTML entities to Unicode characters
value = html_unescape(value)
if not unicode:
# Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty`
value = unicodedata.normalize('NFKD', value)
Expand Down Expand Up @@ -63,41 +68,81 @@
return id


def get_name(el: etree.Element) -> str:
    """ Return the concatenated, stripped text content of `el`.

    Text chunks which are `AtomicString` instances have their HTML entities
    converted to characters; all other chunks are used as-is.
    """
    pieces = (
        html.unescape(chunk) if isinstance(chunk, AtomicString) else chunk
        for chunk in el.itertext()
    )
    return ''.join(pieces).strip()


def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str:
    """ Replace each raw HTML placeholder in `text` with the plain text of the stashed HTML.

    Tags are always removed from the stashed HTML. Entities are removed as well
    unless `strip_entities` is false. A placeholder which cannot be resolved
    against `md.htmlStash.rawHtmlBlocks` is left in place unchanged.
    """
    def _plain_text(match: re.Match[str]) -> str:
        """ Return the stashed raw HTML for `match`, reduced to plain text. """
        try:
            stashed = md.htmlStash.rawHtmlBlocks[int(match.group(1))]
        except (IndexError, TypeError):  # pragma: no cover
            return match.group(0)
        # Drop the tags first, then (optionally) the entities - leaving only text
        plain = re.sub(r'(<[^>]+>)', '', stashed)
        return re.sub(r'(&[\#a-zA-Z0-9]+;)', '', plain) if strip_entities else plain

    return HTML_PLACEHOLDER_RE.sub(_plain_text, text)


def unescape(text: str) -> str:
""" Unescape escaped text. """
def md_unescape(text: str) -> str:
    """ Return `text` after passing it through `UnescapeTreeprocessor.unescape`. """
    return UnescapeTreeprocessor().unescape(text)


def strip_tags(text: str) -> str:
    """ Return `text` as plain text with HTML comments and tags removed.

    Runs of whitespace are collapsed to a single space and leading/trailing
    whitespace is dropped. Note: HTML entities are left untouched.
    """
    def _remove_spans(source: str, opener: str, closer: str) -> str:
        """ Repeatedly cut the first `opener`...`closer` span until none remains. """
        while True:
            begin = source.find(opener)
            if begin == -1:
                return source
            finish = source.find(closer, begin)
            if finish == -1:
                # An opener without a closer is left as-is
                return source
            source = source[:begin] + source[finish + len(closer):]

    # A comment could contain a tag, so comments must go before tags
    without_comments = _remove_spans(text, '<!--', '-->')
    without_tags = _remove_spans(without_comments, '<', '>')

    # Collapse whitespace
    return ' '.join(without_tags.split())


def escape_cdata(text: str) -> str:
    """ Return `text` with `&`, `<` and `>` escaped for use as character data. """
    if "&" in text:
        # Only replace & when not part of an entity
        text = RE_AMP.sub('&amp;', text)
    # `str.replace` is a no-op when the character is absent, so no membership test is needed
    return text.replace("<", "&lt;").replace(">", "&gt;")


def run_postprocessors(text: str, md: Markdown) -> str:
    """ Feed `text` through each of `md.postprocessors` in turn and return the stripped result. """
    rendered = text
    for processor in md.postprocessors:
        # Each postprocessor receives the output of the previous one
        rendered = processor.run(rendered)
    return rendered.strip()


def render_inner_html(el: etree.Element, md: Markdown) -> str:
    """ Return the fully rendered inner HTML of an etree element as a string.

    The element is serialized with `md.serializer`, Markdown backslash escapes
    are resolved, the enclosing tag is dropped and the remainder is run
    through the postprocessors of `md`.
    """
    # The UnescapeTreeprocessor runs after TOC, so backslash escapes are resolved here.
    serialized = md_unescape(md.serializer(el))

    # Strip the parent tag: keep what lies between the first `>` and the last `<`.
    open_end = serialized.index('>')
    close_start = serialized.rindex('<')
    inner = serialized[open_end + 1:close_start].strip()

    return run_postprocessors(inner, md)


def copy_element(el: etree.Element, exclude_fnrefs=True) -> etree.Element:
    """ Return a deep copy of an etree element, optionally with footnote references removed.

    The element passed in is never modified. When `exclude_fnrefs` is true, every
    `sup` element whose `id` starts with `fnref` is removed from the copy, at any
    depth (so a reference nested inside inline markup such as `<em>` is removed as
    well). The tail text which follows a removed reference is preserved by moving
    it to the preceding sibling, or to the parent's text when there is none.
    """
    el = deepcopy(el)
    if not exclude_fnrefs:
        return el

    # Footnote references look like this: `<sup id="fnref:1">...</sup>`.
    # Visit each element which has a `sup` child rather than only the direct
    # children of `el`, otherwise nested references would be left in place.
    for parent in el.findall('.//sup/..'):
        carry = ''
        # Walk a reversed snapshot of the children so removal is safe while iterating
        # and so the tail of a removed reference can be handed to whatever precedes it.
        for child in reversed(list(parent)):
            if child.tag == 'sup' and child.get('id', '').startswith('fnref'):
                # We have a footnote reference. Remove it, but keep its tail text.
                carry = f'{child.tail or ""}{carry}'
                parent.remove(child)
            elif carry:
                child.tail = f'{child.tail or ""}{carry}'
                carry = ''
        if carry:
            # No preceding sibling was left to receive the text
            parent.text = f'{parent.text or ""}{carry}'
    return el


def nest_toc_tokens(toc_list):
"""Given an unsorted list with errors and skips, return a nested one.

Expand Down Expand Up @@ -300,27 +345,29 @@
for el in doc.iter():
if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
self.set_level(el)
text = get_name(el)
html = render_inner_html(copy_element(el), self.md)
text = strip_tags(html)

# Do not override pre-existing ids
if "id" not in el.attrib:
innertext = unescape(stashedHTML2text(text, self.md))
el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids)
el.attrib["id"] = unique(self.slugify(text, self.sep), used_ids)

if 'data-toc-label' in el.attrib:
text = md_unescape(el.attrib['data-toc-label'])
text = run_postprocessors(text, self.md)
text = strip_tags(text)
text = escape_cdata(text)
# Remove the data-toc-label attribute as it is no longer needed
del el.attrib['data-toc-label']

if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom:
toc_tokens.append({
'level': int(el.tag[-1]),
'id': el.attrib["id"],
'name': unescape(stashedHTML2text(
code_escape(el.attrib.get('data-toc-label', text)),
self.md, strip_entities=False
))
'name': text,
'html': html
})

# Remove the data-toc-label attribute as it is no longer needed
if 'data-toc-label' in el.attrib:
del el.attrib['data-toc-label']

if self.use_anchors:
self.add_anchor(el, el.attrib["id"])
if self.use_permalinks not in [False, None]:
Expand Down
39 changes: 22 additions & 17 deletions tests/test_extensions.py
waylan marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -420,9 +420,9 @@ def testUniqueIds(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'header', 'name': 'Header', 'children': []},
{'level': 1, 'id': 'header_1', 'name': 'Header', 'children': []},
{'level': 1, 'id': 'header_2', 'name': 'Header', 'children': []},
{'level': 1, 'id': 'header', 'name': 'Header', 'html': 'Header', 'children': []},
{'level': 1, 'id': 'header_1', 'name': 'Header', 'html': 'Header', 'children': []},
{'level': 1, 'id': 'header_2', 'name': 'Header', 'html': 'Header', 'children': []},
])

def testHtmlEntities(self):
Expand All @@ -441,7 +441,7 @@ def testHtmlEntities(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &amp; bar', 'children': []},
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &amp; bar', 'html': 'Foo &amp; bar', 'children': []},
])

def testHtmlSpecialChars(self):
Expand All @@ -460,7 +460,7 @@ def testHtmlSpecialChars(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &gt; &amp; bar', 'children': []},
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &gt; &amp; bar', 'html': 'Foo &gt; &amp; bar', 'children': []},
])

def testRawHtml(self):
Expand All @@ -479,7 +479,7 @@ def testRawHtml(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'children': []},
{'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'html': 'Foo <b>Bar</b> Baz.', 'children': []},
])

def testBaseLevel(self):
Expand Down Expand Up @@ -508,9 +508,9 @@ def testBaseLevel(self):
'</div>\n'
)
self.assertEqual(md.toc_tokens, [
{'level': 5, 'id': 'some-header', 'name': 'Some Header', 'children': [
{'level': 6, 'id': 'next-level', 'name': 'Next Level', 'children': []},
{'level': 6, 'id': 'too-high', 'name': 'Too High', 'children': []},
{'level': 5, 'id': 'some-header', 'name': 'Some Header', 'html': 'Some Header', 'children': [
{'level': 6, 'id': 'next-level', 'name': 'Next Level', 'html': 'Next Level', 'children': []},
{'level': 6, 'id': 'too-high', 'name': 'Too High', 'html': 'Too High', 'children': []},
]},
])

Expand All @@ -532,9 +532,13 @@ def testHeaderInlineMarkup(self):
'</ul>\n' # noqa
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'some-header-with-markup', 'name': 'Some Header with markup.', 'children': []},
])
self.assertEqual(self.md.toc_tokens, [{
'level': 1,
'id': 'some-header-with-markup',
'name': 'Some Header with markup.',
'html': 'Some <em>Header</em> with <a href="http://example.com">markup</a>.',
'children': []
}])

def testTitle(self):
""" Test TOC Title. """
Expand All @@ -549,6 +553,7 @@ def testTitle(self):

def testWithAttrList(self):
""" Test TOC with `attr_list` Extension. """
self.maxDiff = None
md = markdown.Markdown(extensions=['toc', 'attr_list'])
text = ('# Header 1\n\n'
'## Header 2 { #foo }\n\n'
Expand Down Expand Up @@ -580,12 +585,12 @@ def testWithAttrList(self):
'</div>\n'
)
self.assertEqual(md.toc_tokens, [
{'level': 1, 'id': 'header-1', 'name': 'Header 1', 'children': [
{'level': 2, 'id': 'foo', 'name': 'Header 2', 'children': []},
{'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'children': []}
{'level': 1, 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', 'children': [
{'level': 2, 'id': 'foo', 'name': 'Header 2', 'html': 'Header 2', 'children': []},
{'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'html': 'Header 3', 'children': []}
]},
{'level': 1, 'id': 'header-4', 'name': 'Foo &gt; Baz', 'children': []},
{'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'children': []},
{'level': 1, 'id': 'header-4', 'name': 'Foo &gt; Baz', 'html': 'Header 4', 'children': []},
{'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'html': 'Header 5', 'children': []},
])

def testUniqueFunc(self):
Expand Down
1 change: 1 addition & 0 deletions tests/test_syntax/extensions/test_smarty.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ def test_smarty_and_toc(self):
'level': 1,
'id': 'foo-bar',
'name': 'Foo &mdash; bar',
'html': '<em>Foo</em> &mdash; <code>bar</code>',
'children': [],
},
],
Expand Down
Loading
Loading