diff --git a/pelican/contents.py b/pelican/contents.py index 3d9e086c..c0a3e775 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -27,6 +27,7 @@ from pelican.utils import ( sanitised_join, set_date_tzinfo, slugify, + strip_toc_elements_from_html, truncate_html_paragraphs, truncate_html_words, ) @@ -446,13 +447,19 @@ class Content: content = truncate_html_paragraphs(self.content, max_paragraphs) if self.settings["SUMMARY_MAX_LENGTH"] is None: - return content + summary = content + else: + summary = truncate_html_words( + content, + self.settings["SUMMARY_MAX_LENGTH"], + self.settings["SUMMARY_END_SUFFIX"], + ) - return truncate_html_words( - content, - self.settings["SUMMARY_MAX_LENGTH"], - self.settings["SUMMARY_END_SUFFIX"], - ) + # Strip TOC elements that would contain broken links in summary context + # TOC anchors only work in full article view, not in summaries/excerpts + summary = strip_toc_elements_from_html(summary) + + return summary @property def summary(self) -> str: diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 4b1effa2..4e48e594 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -172,6 +172,28 @@ class TestPage(TestBase): ) self.assertIn("test_marker", page.summary) + def test_summary_strips_toc_elements(self): + """Auto-generated summary should strip TOC divs and toc-backref anchors.""" + page_kwargs = self._copy_page_kwargs() + settings = get_settings() + page_kwargs["settings"] = settings + del page_kwargs["metadata"]["summary"] + toc_content = ( + '
Table of contents
" + '' + "First paragraph of real content.
" + ) + page_kwargs["content"] = toc_content + settings["SUMMARY_MAX_LENGTH"] = None + page = Page(**page_kwargs) + self.assertNotIn('First paragraph of real content.
", page.summary) + def test_summary_get_summary_warning(self): """calling ._get_summary() should issue a warning""" page_kwargs = self._copy_page_kwargs() diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index b5a53eac..aad07555 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -425,6 +425,60 @@ class TestUtils(LoggedTestCase): self.assertEqual(utils.truncate_html_paragraphs(three, 3), three) self.assertEqual(utils.truncate_html_paragraphs(three, 4), three) + def test_strip_toc_elements_from_html(self): + # Test removing TOC div with various class names + html_with_toc = ( + 'Table of Contents
' + '' + "Some content here
" + ) + result = utils.strip_toc_elements_from_html(html_with_toc) + self.assertNotIn('Some content
" + ) + result = utils.strip_toc_elements_from_html(html_with_backref) + self.assertNotIn("toc-backref", result) + self.assertNotIn("Section Heading", result) + + # Test combined - remove both TOC div and backrefs + html_combined = ( + 'TOC here
" + "Article content
" + 'More content
" + ) + result = utils.strip_toc_elements_from_html(html_combined) + self.assertNotIn('Just some plain content
" + self.assertEqual(utils.strip_toc_elements_from_html(plain_html), plain_html) + + # Test case-insensitive matching + html_mixed_case = 'TOC
Content
' + result = utils.strip_toc_elements_from_html(html_mixed_case) + self.assertNotIn("CONTENTS", result) + self.assertIn("Content
", result) + def test_process_translations(self): fr_articles = [] en_articles = [] diff --git a/pelican/utils.py b/pelican/utils.py index c17422aa..ee14a031 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -649,6 +649,37 @@ def truncate_html_paragraphs(s, count): return "".join(paragraphs) +def strip_toc_elements_from_html(html: str) -> str: + """Strip table of contents elements from HTML summaries. + + Removes TOC divs (with broken navigation links) and toc-backref anchor + links from headings. Both are necessary since TOC anchor targets don't + exist when summaries are displayed outside full article context + (e.g., homepage, RSS feeds). + + :param html: HTML content to process + :return: Cleaned HTML with TOC elements removed + """ + # Remove the entire