diff --git a/pelican/contents.py b/pelican/contents.py index a06a1a6a..7c19a0dc 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -27,6 +27,7 @@ from pelican.utils import ( sanitised_join, set_date_tzinfo, slugify, + strip_toc_elements_from_html, truncate_html_paragraphs, truncate_html_words, ) @@ -446,13 +447,19 @@ class Content: content = truncate_html_paragraphs(self.content, max_paragraphs) if self.settings["SUMMARY_MAX_LENGTH"] is None: - return content + summary = content + else: + summary = truncate_html_words( + content, + self.settings["SUMMARY_MAX_LENGTH"], + self.settings["SUMMARY_END_SUFFIX"], + ) - return truncate_html_words( - content, - self.settings["SUMMARY_MAX_LENGTH"], - self.settings["SUMMARY_END_SUFFIX"], - ) + # Strip TOC elements that would contain broken links in summary context + # TOC anchors only work in full article view, not in summaries/excerpts + summary = strip_toc_elements_from_html(summary) + + return summary @property def summary(self) -> str: diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 95bf197a..0a2cd971 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -420,6 +420,60 @@ class TestUtils(LoggedTestCase): self.assertEqual(utils.truncate_html_paragraphs(three, 3), three) self.assertEqual(utils.truncate_html_paragraphs(three, 4), three) + def test_strip_toc_elements_from_html(self): + # Test removing TOC div with various class names + html_with_toc = ( + '
' + '

Table of Contents

' + '' + "
" + "

Some content here

" + ) + result = utils.strip_toc_elements_from_html(html_with_toc) + self.assertNotIn('
Some content here

", result) + + # Test removing toc-backref anchors while preserving heading text + html_with_backref = ( + '

Section Heading

' + "

Some content

" + ) + result = utils.strip_toc_elements_from_html(html_with_backref) + self.assertNotIn("toc-backref", result) + self.assertNotIn("Section Heading", result) + + # Test combined - remove both TOC div and backrefs + html_combined = ( + '
' + "

TOC here

" + "
" + '

the design

' + "

Article content

" + '

key features

' + "

More content

" + ) + result = utils.strip_toc_elements_from_html(html_combined) + self.assertNotIn('
Article content

", result) + + # Test empty input + self.assertEqual(utils.strip_toc_elements_from_html(""), "") + + # Test HTML without TOC elements (should be unchanged) + plain_html = "

Just some plain content

" + self.assertEqual(utils.strip_toc_elements_from_html(plain_html), plain_html) + + # Test case-insensitive matching + html_mixed_case = '

TOC

Content

' + result = utils.strip_toc_elements_from_html(html_mixed_case) + self.assertNotIn("CONTENTS", result) + self.assertIn("

Content

", result) + def test_process_translations(self): fr_articles = [] en_articles = [] diff --git a/pelican/utils.py b/pelican/utils.py index eb281ce9..a015fd1e 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -643,6 +643,37 @@ def truncate_html_paragraphs(s, count): return "".join(paragraphs) +def strip_toc_elements_from_html(html: str) -> str: + """Strip table of contents elements from HTML summaries. + + Removes TOC divs (with broken navigation links) and toc-backref anchor + links from headings. Both are necessary since TOC anchor targets don't + exist when summaries are displayed outside full article context + (e.g., homepage, RSS feeds). + + :param html: HTML content to process + :return: Cleaned HTML with TOC elements removed + """ + # Remove the entire
...
block + html = re.sub( + r']*>.*?
', + "", + html, + flags=re.DOTALL | re.IGNORECASE, + ) + + # Remove anchor links from headings (e.g., text) + # These links point to anchors that don't exist in summary context + html = re.sub( + r']*class="[^"]*toc-backref[^"]*"[^>]*>(.*?)', + r"\1", + html, + flags=re.DOTALL | re.IGNORECASE, + ) + + return html + + def process_translations( content_list: list[Content], translation_id: str | Collection[str] | None = None,