diff --git a/pelican/contents.py b/pelican/contents.py index 3d9e086c..c0a3e775 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -27,6 +27,7 @@ from pelican.utils import ( sanitised_join, set_date_tzinfo, slugify, + strip_toc_elements_from_html, truncate_html_paragraphs, truncate_html_words, ) @@ -446,13 +447,19 @@ class Content: content = truncate_html_paragraphs(self.content, max_paragraphs) if self.settings["SUMMARY_MAX_LENGTH"] is None: - return content + summary = content + else: + summary = truncate_html_words( + content, + self.settings["SUMMARY_MAX_LENGTH"], + self.settings["SUMMARY_END_SUFFIX"], + ) - return truncate_html_words( - content, - self.settings["SUMMARY_MAX_LENGTH"], - self.settings["SUMMARY_END_SUFFIX"], - ) + # Strip TOC elements that would contain broken links in summary context + # TOC anchors only work in full article view, not in summaries/excerpts + summary = strip_toc_elements_from_html(summary) + + return summary @property def summary(self) -> str: diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 4b1effa2..4e48e594 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -172,6 +172,28 @@ class TestPage(TestBase): ) self.assertIn("test_marker", page.summary) + def test_summary_strips_toc_elements(self): + """Auto-generated summary should strip TOC divs and toc-backref anchors.""" + page_kwargs = self._copy_page_kwargs() + settings = get_settings() + page_kwargs["settings"] = settings + del page_kwargs["metadata"]["summary"] + toc_content = ( + '
' + "

Table of contents

" + '' + "
" + '

My Section

' + "

First paragraph of real content.

" + ) + page_kwargs["content"] = toc_content + settings["SUMMARY_MAX_LENGTH"] = None + page = Page(**page_kwargs) + self.assertNotIn('
My Section", page.summary) + self.assertIn("

First paragraph of real content.

", page.summary) + def test_summary_get_summary_warning(self): """calling ._get_summary() should issue a warning""" page_kwargs = self._copy_page_kwargs() diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index b5a53eac..aad07555 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -425,6 +425,60 @@ class TestUtils(LoggedTestCase): self.assertEqual(utils.truncate_html_paragraphs(three, 3), three) self.assertEqual(utils.truncate_html_paragraphs(three, 4), three) + def test_strip_toc_elements_from_html(self): + # Test removing TOC div with various class names + html_with_toc = ( + '
' + '

Table of Contents

' + '' + "
" + "

Some content here

" + ) + result = utils.strip_toc_elements_from_html(html_with_toc) + self.assertNotIn('
Some content here

", result) + + # Test removing toc-backref anchors while preserving heading text + html_with_backref = ( + '

Section Heading

' + "

Some content

" + ) + result = utils.strip_toc_elements_from_html(html_with_backref) + self.assertNotIn("toc-backref", result) + self.assertNotIn("Section Heading", result) + + # Test combined - remove both TOC div and backrefs + html_combined = ( + '
' + "

TOC here

" + "
" + '

the design

' + "

Article content

" + '

key features

' + "

More content

" + ) + result = utils.strip_toc_elements_from_html(html_combined) + self.assertNotIn('
Article content

", result) + + # Test empty input + self.assertEqual(utils.strip_toc_elements_from_html(""), "") + + # Test HTML without TOC elements (should be unchanged) + plain_html = "

Just some plain content

" + self.assertEqual(utils.strip_toc_elements_from_html(plain_html), plain_html) + + # Test case-insensitive matching + html_mixed_case = '

TOC

Content

' + result = utils.strip_toc_elements_from_html(html_mixed_case) + self.assertNotIn("CONTENTS", result) + self.assertIn("

Content

# Class tokens (compared case-insensitively) that mark a table-of-contents
# container and a heading back-reference link, respectively.
_TOC_CONTAINER_CLASSES: frozenset[str] = frozenset({"toc", "contents"})
_TOC_BACKREF_CLASSES: frozenset[str] = frozenset({"toc-backref"})

# Any opening or closing <div> tag; group 1 is "/" for a closing tag.
_DIV_TAG_RE = re.compile(r"<(/?)div(?=[\s/>])[^>]*>", re.IGNORECASE)
# Opening <a ...> tag carrying at least one attribute, and its closing tag.
_ANCHOR_OPEN_RE = re.compile(r"<a(?=\s)[^>]*>", re.IGNORECASE)
_ANCHOR_CLOSE_RE = re.compile(r"</a\s*>", re.IGNORECASE)
# A class attribute inside a tag, double- or single-quoted.
_CLASS_ATTR_RE = re.compile(
    r"""\sclass\s*=\s*(?:"([^"]*)"|'([^']*)')""",
    re.IGNORECASE,
)


def _tag_has_class(tag_text: str, wanted: frozenset[str]) -> bool:
    """Return True if the tag's class attribute contains a token in *wanted*."""
    found = _CLASS_ATTR_RE.search(tag_text)
    if found is None:
        return False
    value = found.group(1) if found.group(1) is not None else found.group(2)
    return any(token.lower() in wanted for token in value.split())


def _remove_toc_divs(html: str) -> str:
    """Drop every TOC ``<div>`` together with everything nested inside it."""
    kept: list[str] = []
    keep_from = 0  # start of the next slice of ``html`` to keep
    depth = 0  # > 0 while scanning inside a TOC div that is being dropped
    toc_start = 0  # offset of the TOC div currently being dropped

    for tag in _DIV_TAG_RE.finditer(html):
        is_closing = bool(tag.group(1))
        is_self_closed = tag.group(0).endswith("/>")

        if depth == 0:
            if is_closing or not _tag_has_class(
                tag.group(0), _TOC_CONTAINER_CLASSES
            ):
                continue
            kept.append(html[keep_from : tag.start()])
            if is_self_closed:
                keep_from = tag.end()
            else:
                depth = 1
                toc_start = tag.start()
        elif is_closing:
            # Track nesting so the TOC ends at its *matching* closing tag,
            # not at the first closing div encountered.
            depth -= 1
            if depth == 0:
                keep_from = tag.end()
        elif not is_self_closed:
            depth += 1

    if depth:
        # The TOC div was never closed: leave it in place rather than
        # swallowing the rest of the document.
        keep_from = toc_start
    kept.append(html[keep_from:])
    return "".join(kept)


def _unwrap_toc_backrefs(html: str) -> str:
    """Replace each ``toc-backref`` anchor with its inner content."""
    kept: list[str] = []
    pos = 0
    for opening in _ANCHOR_OPEN_RE.finditer(html):
        if opening.start() < pos:
            # Lies inside an anchor that has already been unwrapped.
            continue
        if not _tag_has_class(opening.group(0), _TOC_BACKREF_CLASSES):
            continue
        closing = _ANCHOR_CLOSE_RE.search(html, opening.end())
        if closing is None:
            # No closing tag from here on, so nothing further can be unwrapped.
            break
        kept.append(html[pos : opening.start()])
        kept.append(html[opening.end() : closing.start()])
        pos = closing.end()
    kept.append(html[pos:])
    return "".join(kept)


def strip_toc_elements_from_html(html: str) -> str:
    """Strip table of contents elements from HTML summaries.

    Two kinds of markup are removed, because the anchors they point to do not
    exist when a summary is shown outside the full article (e.g. on an index
    page or in a feed):

    * ``<div>`` elements whose ``class`` attribute contains the token ``toc``
      or ``contents`` are removed together with their content, including any
      nested ``<div>`` elements.
    * ``<a>`` elements whose ``class`` attribute contains the token
      ``toc-backref`` are replaced by their inner content, so the heading
      text is preserved.

    Tag names, attribute names and class tokens are matched
    case-insensitively, and both double- and single-quoted attribute values
    are recognised. Everything else is returned byte-for-byte unchanged. A TOC
    ``<div>`` without a matching closing tag is left untouched.

    This is a lightweight tag scanner, not an HTML parser: a ``>`` character
    inside an attribute value is not supported.

    :param html: HTML content to process
    :return: Cleaned HTML with TOC elements removed
    """
    return _unwrap_toc_backrefs(_remove_toc_divs(html))