diff --git a/pelican/contents.py b/pelican/contents.py index a06a1a6a..7c19a0dc 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -27,6 +27,7 @@ from pelican.utils import ( sanitised_join, set_date_tzinfo, slugify, + strip_toc_elements_from_html, truncate_html_paragraphs, truncate_html_words, ) @@ -446,13 +447,19 @@ class Content: content = truncate_html_paragraphs(self.content, max_paragraphs) if self.settings["SUMMARY_MAX_LENGTH"] is None: - return content + summary = content + else: + summary = truncate_html_words( + content, + self.settings["SUMMARY_MAX_LENGTH"], + self.settings["SUMMARY_END_SUFFIX"], + ) - return truncate_html_words( - content, - self.settings["SUMMARY_MAX_LENGTH"], - self.settings["SUMMARY_END_SUFFIX"], - ) + # Strip TOC elements that would contain broken links in summary context + # TOC anchors only work in full article view, not in summaries/excerpts + summary = strip_toc_elements_from_html(summary) + + return summary @property def summary(self) -> str: diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 95bf197a..0a2cd971 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -420,6 +420,60 @@ class TestUtils(LoggedTestCase): self.assertEqual(utils.truncate_html_paragraphs(three, 3), three) self.assertEqual(utils.truncate_html_paragraphs(three, 4), three) + def test_strip_toc_elements_from_html(self): + # Test removing TOC div with various class names + html_with_toc = ( + '
Table of Contents
' + '' + "Some content here
" + ) + result = utils.strip_toc_elements_from_html(html_with_toc) + self.assertNotIn('Some content
" + ) + result = utils.strip_toc_elements_from_html(html_with_backref) + self.assertNotIn("toc-backref", result) + self.assertNotIn("Section Heading", result) + + # Test combined - remove both TOC div and backrefs + html_combined = ( + 'TOC here
" + "Article content
" + 'More content
" + ) + result = utils.strip_toc_elements_from_html(html_combined) + self.assertNotIn('Just some plain content
" + self.assertEqual(utils.strip_toc_elements_from_html(plain_html), plain_html) + + # Test case-insensitive matching + html_mixed_case = 'TOC
Content
' + result = utils.strip_toc_elements_from_html(html_mixed_case) + self.assertNotIn("CONTENTS", result) + self.assertIn("Content
", result) + def test_process_translations(self): fr_articles = [] en_articles = [] diff --git a/pelican/utils.py b/pelican/utils.py index eb281ce9..a015fd1e 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -643,6 +643,37 @@ def truncate_html_paragraphs(s, count): return "".join(paragraphs) +def strip_toc_elements_from_html(html: str) -> str: + """Strip table of contents elements from HTML summaries. + + Removes TOC divs (with broken navigation links) and toc-backref anchor + links from headings. Both are necessary since TOC anchor targets don't + exist when summaries are displayed outside full article context + (e.g., homepage, RSS feeds). + + :param html: HTML content to process + :return: Cleaned HTML with TOC elements removed + """ + # Remove the entire