Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 3 additions & 13 deletions mkdocs/structure/pages.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import copy
import enum
import logging
import posixpath
Expand All @@ -20,6 +19,7 @@
from mkdocs.structure import StructureItem
from mkdocs.structure.toc import get_toc
from mkdocs.utils import _removesuffix, get_build_date, get_markdown_title, meta, weak_property
from mkdocs.utils.rendering import get_heading_text

if TYPE_CHECKING:
from xml.etree import ElementTree as etree
Expand Down Expand Up @@ -555,23 +555,13 @@ class _ExtractTitleTreeprocessor(markdown.treeprocessors.Treeprocessor):
def run(self, root: etree.Element) -> etree.Element:
for el in root:
if el.tag == 'h1':
# Drop anchorlink from the element, if present.
if len(el) > 0 and el[-1].tag == 'a' and not (el[-1].tail or '').strip():
el = copy.copy(el)
del el[-1]
# Extract the text only, recursively.
title = ''.join(el.itertext())
# Unescape per Markdown implementation details.
title = markdown.extensions.toc.stashedHTML2text(
title, self.md, strip_entities=False
)
self.title = title.strip()
self.title = get_heading_text(el, self.md)
break
return root

def _register(self, md: markdown.Markdown) -> None:
self.md = md
md.treeprocessors.register(self, "mkdocs_extract_title", priority=-1) # After the end.
md.treeprocessors.register(self, "mkdocs_extract_title", priority=1) # Close to the end.


class _AbsoluteLinksValidationValue(enum.IntEnum):
Expand Down
42 changes: 34 additions & 8 deletions mkdocs/tests/structure/page_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,31 +342,57 @@ def test_page_title_from_setext_markdown(self):
expected='Welcome to MkDocs Setext',
)

def test_page_title_from_markdown_with_email(self):
self._test_extract_title(
'''# <foo@example.org>''',
expected='&#102;&#111;&#111;&#64;&#101;&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#111;&#114;&#103;',
)

def test_page_title_from_markdown_stripped_anchorlinks(self):
self._test_extract_title(
self._SETEXT_CONTENT,
extensions={'toc': {'permalink': '&'}},
expected='Welcome to MkDocs Setext',
)

def test_page_title_from_markdown_strip_footnoteref(self):
foootnotes = '''\n\n[^1]: foo\n[^2]: bar'''
self._test_extract_title(
'''# Header[^1] foo[^2] bar''' + foootnotes,
extensions={'footnotes': {}},
expected='Header foo bar',
)
self._test_extract_title(
'''# *Header[^1]* *foo*[^2]''' + foootnotes,
extensions={'footnotes': {}},
expected='Header foo',
)
self._test_extract_title(
'''# *Header[^1][^2]s''' + foootnotes,
extensions={'footnotes': {}},
expected='*Headers',
)

def test_page_title_from_markdown_strip_formatting(self):
self._test_extract_title(
'''# \\*Hello --- *beautiful* `wor<dl>`''',
extensions={'smarty': {}},
expected='*Hello &mdash; beautiful wor&lt;dl&gt;',
)

def test_page_title_from_markdown_html_entity(self):
self._test_extract_title('''# Foo &lt; &amp; bar''', expected='Foo &lt; &amp; bar')
self._test_extract_title('''# Foo > & bar''', expected='Foo &gt; &amp; bar')

def test_page_title_from_markdown_strip_raw_html(self):
self._test_extract_title(
'''# Hello <b>world</b>''',
expected='Hello world',
)
self._test_extract_title('''# Hello <b>world</b>''', expected='Hello world')

def test_page_title_from_markdown_strip_comments(self):
self._test_extract_title('''# foo <!-- comment with <em> --> bar''', expected='foo bar')

def test_page_title_from_markdown_strip_image(self):
self._test_extract_title(
'''# Hi ![😄](hah.png)''',
expected='Hi', # TODO: Should the alt text of the image be extracted?
)
self._test_extract_title('''# Hi ![😄](hah.png)''', expected='Hi 😄')
self._test_extract_title('''# Hi *-![😄](hah.png)-*''', expected='Hi -😄-')

_ATTRLIST_CONTENT = dedent(
'''
Expand Down
104 changes: 104 additions & 0 deletions mkdocs/utils/rendering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from __future__ import annotations

import copy
from typing import TYPE_CHECKING, Callable

import markdown
import markdown.treeprocessors

if TYPE_CHECKING:
from xml.etree import ElementTree as etree

# TODO: This will become unnecessary after min-versions have Markdown >=3.4
def _identity(text: str) -> str:
    """Return `text` unchanged; used when Markdown's unescape step is unavailable."""
    return text


# `_unescape` is the bound `unescape` method of an `UnescapeTreeprocessor` when
# `markdown.treeprocessors` provides that class. If the lookup raises
# `AttributeError`, fall back to a no-op so callers can always call `_unescape(text)`.
_unescape: Callable[[str], str]
try:
    _unescape = markdown.treeprocessors.UnescapeTreeprocessor().unescape
except AttributeError:
    _unescape = _identity

# TODO: Most of this file will become unnecessary after https://github.com/Python-Markdown/markdown/pull/1441


def get_heading_text(el: etree.Element, md: markdown.Markdown) -> str:
    """Return the plain-text content of a heading element.

    Works on a deep copy of `el`, so the element passed in is never mutated.
    The copy has its trailing anchorlink removed, footnote references dropped
    and images replaced by their `alt` text; the result is then rendered to
    inner HTML and stripped of tags.
    """
    heading = copy.deepcopy(el)
    for cleanup in (_remove_anchorlink, _remove_fnrefs, _extract_alt_texts):
        cleanup(heading)
    inner_html = _render_inner_html(heading, md)
    return _strip_tags(inner_html)


def _strip_tags(text: str) -> str:
    """Remove HTML comments and tags from `text` and collapse whitespace.

    HTML entities (such as `&lt;`) are left exactly as they are.
    """

    def _drop_delimited(source: str, opener: str, closer: str) -> str:
        # Repeatedly cut out the first `opener ... closer` span until either
        # delimiter can no longer be found.
        while True:
            begin = source.find(opener)
            if begin == -1:
                return source
            finish = source.find(closer, begin)
            if finish == -1:
                return source
            source = source[:begin] + source[finish + len(closer) :]

    # Comments go first because a comment may itself contain something tag-like.
    without_comments = _drop_delimited(text, '<!--', '-->')
    without_tags = _drop_delimited(without_comments, '<', '>')

    # Splitting on any whitespace and re-joining collapses runs and trims the ends.
    return ' '.join(without_tags.split())


def _render_inner_html(el: etree.Element, md: markdown.Markdown) -> str:
    """Serialize `el` with `md.serializer` and return only its inner HTML.

    The serialized text is unescaped, the outermost opening and closing tags
    are cut away, and every postprocessor in `md.postprocessors` is applied in
    order. Raises `ValueError` if the serialized text contains no `>` or `<`.
    """
    # The `UnescapeTreeprocessor` runs after `toc` extension so run here.
    serialized = _unescape(md.serializer(el))

    # Strip parent tag: keep what lies between the first '>' and the last '<'.
    inner_start = serialized.index('>') + 1
    inner_end = serialized.rindex('<')
    html = serialized[inner_start:inner_end].strip()

    for postprocessor in md.postprocessors:
        html = postprocessor.run(html)
    return html


def _remove_anchorlink(el: etree.Element) -> None:
    """Drop the trailing anchorlink from the element, if present.

    Only the last child of `el` is inspected. It is removed, in place, when it
    is an `<a>` element whose `class` attribute lists `headerlink`. The
    attribute is read as a whitespace-separated list of class names, so a value
    such as `"headerlink extra"` matches as well as a bare `"headerlink"`.
    """
    if len(el) == 0:
        return
    last_child = el[-1]
    if last_child.tag != 'a':
        return
    class_names = (last_child.get('class') or '').split()
    if 'headerlink' in class_names:
        del el[-1]


def _remove_fnrefs(root: etree.Element) -> None:
    """Strip footnote references found anywhere below `root`, in place."""
    # Select the parents of every `<sup>` carrying an `id`; the predicate then
    # decides which of their children are removed.
    sup_parents = root.findall('.//sup[@id]/..')
    for holder in sup_parents:
        _replace_elements_with_text(holder, _predicate_for_fnrefs)


def _predicate_for_fnrefs(el: etree.Element) -> str | None:
    """Return `''` for a `<sup>` whose `id` starts with `fnref`, otherwise `None`."""
    is_fnref = el.tag == 'sup' and el.get('id', '').startswith('fnref')
    return '' if is_fnref else None


def _extract_alt_texts(root: etree.Element) -> None:
    """Swap each image below `root` that has `alt` text for that text, in place."""
    # Select the parents of every `<img>` with an `alt` attribute; the predicate
    # then decides which of their children are replaced.
    img_parents = root.findall('.//img[@alt]/..')
    for holder in img_parents:
        _replace_elements_with_text(holder, _predicate_for_alt_texts)


def _predicate_for_alt_texts(el: etree.Element) -> str | None:
    """Return the non-empty `alt` text of an `<img>`, otherwise `None`."""
    if el.tag != 'img':
        return None
    alt_text = el.get('alt')
    # A missing or empty `alt` counts as "no replacement".
    return alt_text if alt_text else None


def _replace_elements_with_text(
    parent: etree.Element, predicate: Callable[[etree.Element], str | None]
) -> None:
    """Replace matching direct children of `parent` with plain text, in place.

    `predicate` is called once per child. A return value of `None` keeps the
    child; any string (including `''`) removes the child and puts that string,
    followed by the child's `tail`, where the child used to be.
    """
    pending = ""
    # Walk backwards so that removing the current child never shifts the
    # positions of the children still to be visited.
    for node in reversed(parent):
        replacement = predicate(node)
        if replacement is None:
            # A kept child absorbs whatever text the removed siblings after it left.
            if pending:
                node.tail = (node.tail or "") + pending
                pending = ""
            continue
        # Removed child: its replacement and tail go in front of the pending text.
        pending = "".join((replacement, node.tail or "", pending))
        parent.remove(node)
    # Text left over here came from children with no kept sibling before them.
    if pending:
        parent.text = (parent.text or "") + pending
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ dependencies = [
"click >=7.0",
"Jinja2 >=2.11.1",
"markupsafe >=2.0.1",
"Markdown >=3.4.1",
"Markdown >=3.3.6",
"PyYAML >=5.1",
"watchdog >=2.0",
"ghp-import >=1.0",
Expand All @@ -57,7 +57,7 @@ min-versions = [
"click ==7.0",
"Jinja2 ==2.11.1",
"markupsafe ==2.0.1",
"Markdown ==3.4.1",
"Markdown ==3.3.6",
"PyYAML ==5.1",
"watchdog ==2.0",
"ghp-import ==1.0",
Expand Down