mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Strip TOC elements from article summaries
Automatically remove table-of-contents divs and toc-backref anchor links from article summaries when they are displayed outside the full article context (e.g., on the homepage or in RSS feeds).

reStructuredText automatically generates anchor links in section headings when a table of contents directive is present. These anchors work on full article pages, but become broken links when article summaries appear on the homepage or in feeds, because the anchor targets don't exist in that context.

This change adds a strip_toc_elements_from_html() function in pelican/utils.py that uses regular expressions to remove:

- TOC div blocks (<div class="contents">...</div>) containing broken navigation
- toc-backref anchor links from headings, while preserving the heading text

Both removals are necessary since TOC anchor targets don't exist in summary context. The function is called automatically in Content.get_summary(), so all summaries are cleaned without requiring configuration or template changes.

Includes comprehensive unit tests covering various TOC formats, edge cases, and case-insensitive matching.
This commit is contained in:
parent
b7408cbfe9
commit
3ccc6b2159
3 changed files with 98 additions and 6 deletions
|
|
@ -27,6 +27,7 @@ from pelican.utils import (
|
|||
sanitised_join,
|
||||
set_date_tzinfo,
|
||||
slugify,
|
||||
strip_toc_elements_from_html,
|
||||
truncate_html_paragraphs,
|
||||
truncate_html_words,
|
||||
)
|
||||
|
|
@ -446,13 +447,19 @@ class Content:
|
|||
content = truncate_html_paragraphs(self.content, max_paragraphs)
|
||||
|
||||
if self.settings["SUMMARY_MAX_LENGTH"] is None:
|
||||
return content
|
||||
summary = content
|
||||
else:
|
||||
summary = truncate_html_words(
|
||||
content,
|
||||
self.settings["SUMMARY_MAX_LENGTH"],
|
||||
self.settings["SUMMARY_END_SUFFIX"],
|
||||
)
|
||||
|
||||
return truncate_html_words(
|
||||
content,
|
||||
self.settings["SUMMARY_MAX_LENGTH"],
|
||||
self.settings["SUMMARY_END_SUFFIX"],
|
||||
)
|
||||
# Strip TOC elements that would contain broken links in summary context
|
||||
# TOC anchors only work in full article view, not in summaries/excerpts
|
||||
summary = strip_toc_elements_from_html(summary)
|
||||
|
||||
return summary
|
||||
|
||||
@property
|
||||
def summary(self) -> str:
|
||||
|
|
|
|||
|
|
@ -420,6 +420,60 @@ class TestUtils(LoggedTestCase):
|
|||
self.assertEqual(utils.truncate_html_paragraphs(three, 3), three)
|
||||
self.assertEqual(utils.truncate_html_paragraphs(three, 4), three)
|
||||
|
||||
def test_strip_toc_elements_from_html(self):
    """Check that TOC divs and toc-backref anchors are stripped from HTML.

    Covers a standalone TOC block, standalone heading back-references,
    both together, empty input, TOC-free input, and upper-case markup.
    """
    strip = utils.strip_toc_elements_from_html

    # A TOC block followed by ordinary content: only the TOC disappears.
    toc_then_body = "".join(
        [
            '<div class="contents topic" id="table-of-contents">',
            '<p class="topic-title">Table of Contents</p>',
            '<ul><li><a href="#section1">Section 1</a></li></ul>',
            "</div>",
            "<p>Some content here</p>",
        ]
    )
    stripped = strip(toc_then_body)
    self.assertNotIn('<div class="contents', stripped)
    self.assertIn("<p>Some content here</p>", stripped)

    # A heading wrapped in a toc-backref anchor: the anchor is unwrapped
    # and the heading text stays in place.
    heading_with_backref = (
        '<h2><a class="toc-backref" href="#id1">Section Heading</a></h2>'
        "<p>Some content</p>"
    )
    stripped = strip(heading_with_backref)
    self.assertNotIn("toc-backref", stripped)
    self.assertNotIn("<a class=", stripped)
    self.assertIn("Section Heading", stripped)
    self.assertIn("<h2>Section Heading</h2>", stripped)

    # TOC block and several back-referenced headings in one document.
    toc_and_backrefs = "".join(
        [
            '<div class="contents">',
            "<p>TOC here</p>",
            "</div>",
            '<h2><a class="toc-backref" href="#id1">the design</a></h2>',
            "<p>Article content</p>",
            '<h2><a class="toc-backref" href="#id2">key features</a></h2>',
            "<p>More content</p>",
        ]
    )
    stripped = strip(toc_and_backrefs)
    self.assertNotIn('<div class="contents', stripped)
    self.assertNotIn("toc-backref", stripped)
    self.assertIn("the design", stripped)
    self.assertIn("key features", stripped)
    self.assertIn("<p>Article content</p>", stripped)

    # Empty input comes back empty.
    self.assertEqual(strip(""), "")

    # HTML without any TOC markup must be returned untouched.
    untouched = "<p>Just some plain content</p>"
    self.assertEqual(strip(untouched), untouched)

    # Attribute names and values are matched case-insensitively.
    shouting_toc = '<div CLASS="CONTENTS"><p>TOC</p></div><p>Content</p>'
    stripped = strip(shouting_toc)
    self.assertNotIn("CONTENTS", stripped)
    self.assertIn("<p>Content</p>", stripped)
|
||||
|
||||
def test_process_translations(self):
|
||||
fr_articles = []
|
||||
en_articles = []
|
||||
|
|
|
|||
|
|
@ -643,6 +643,37 @@ def truncate_html_paragraphs(s, count):
|
|||
return "".join(paragraphs)
|
||||
|
||||
|
||||
# Any opening or closing <div> tag; group 1 is "/" for a closing tag and
# group 2 holds the raw attribute text of an opening tag.
_DIV_TAG_RE = re.compile(r"<(/?)div(?=[\s/>])([^>]*)>", re.IGNORECASE)

# The value of a ``class`` attribute (single- or double-quoted) inside the
# attribute text of a tag. The leading whitespace requirement keeps
# attributes such as ``data-class`` from matching.
_CLASS_ATTR_RE = re.compile(
    r"""(?:^|\s)class\s*=\s*(["'])(.*?)\1""",
    re.IGNORECASE | re.DOTALL,
)

# An <a> element whose class attribute mentions ``toc-backref``; group 1 is
# the link text. The lookahead stops tags such as <abbr> or <aside> from
# being mistaken for an anchor.
_TOC_BACKREF_RE = re.compile(
    r'<a(?=\s)[^>]*?\sclass="[^"]*toc-backref[^"]*"[^>]*>(.*?)</a>',
    re.DOTALL | re.IGNORECASE,
)


def _is_toc_div(attrs: str) -> bool:
    """Return True if a div's attribute text carries the ``contents`` class."""
    found = _CLASS_ATTR_RE.search(attrs)
    return found is not None and "contents" in found.group(2).lower().split()


def _strip_toc_divs(html: str) -> str:
    """Remove every ``<div class="contents ...">`` element, nested divs included."""
    kept = []
    cursor = 0  # start of the text not yet copied to ``kept``
    toc_start = -1  # offset of the TOC div currently being skipped
    depth = 0  # div nesting depth inside the current TOC (0 = outside)
    for tag in _DIV_TAG_RE.finditer(html):
        is_closing = bool(tag.group(1))
        if depth == 0:
            if not is_closing and _is_toc_div(tag.group(2)):
                toc_start = tag.start()
                depth = 1
        elif not is_closing:
            depth += 1
        else:
            depth -= 1
            if depth == 0:
                # The TOC's own closing tag: drop everything it enclosed.
                kept.append(html[cursor:toc_start])
                cursor = tag.end()
    # An unclosed TOC div never advances ``cursor`` and is therefore left
    # untouched rather than swallowing the rest of the document.
    kept.append(html[cursor:])
    return "".join(kept)


def strip_toc_elements_from_html(html: str) -> str:
    """Strip table of contents elements from HTML summaries.

    Removes TOC divs (with broken navigation links) and toc-backref anchor
    links from headings. Both are necessary since TOC anchor targets don't
    exist when summaries are displayed outside full article context
    (e.g., homepage, RSS feeds).

    A div counts as a TOC when ``contents`` is one of the whitespace-separated
    tokens of its ``class`` attribute, wherever that attribute appears in the
    tag. Matching is case-insensitive. Divs nested inside the TOC are removed
    together with it; a TOC div without a matching closing tag is left as is.
    For toc-backref anchors only the ``<a>`` wrapper is removed, the link text
    is kept.

    :param html: HTML content to process
    :return: Cleaned HTML with TOC elements removed
    """
    html = _strip_toc_divs(html)

    # Unwrap heading anchors (e.g., <a class="toc-backref" href="#id1">text</a>):
    # these links point to anchors that don't exist in summary context.
    return _TOC_BACKREF_RE.sub(r"\1", html)
|
||||
|
||||
|
||||
def process_translations(
|
||||
content_list: list[Content],
|
||||
translation_id: str | Collection[str] | None = None,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue