From d6461130cc9e2f0b85fe8a0cf7a5904f0ca32545 Mon Sep 17 00:00:00 2001 From: Russell Date: Mon, 13 Apr 2026 14:01:25 -0400 Subject: [PATCH] Strip TOC elements from article summaries * Strip TOC elements from article summaries Automatically remove table of contents divs and toc-backref anchor links from article summaries when displayed outside full article context (e.g., on homepage, in RSS feeds). ReStructuredText automatically generates anchor links in section headings when a table of contents directive is present. These anchors work perfectly on full article pages, but become broken links when article summaries appear on homepage or in feeds - the anchor targets don't exist in that context. This change adds a strip_toc_elements_from_html() function in pelican/utils.py that uses regex to remove: - TOC div blocks (
<div class="contents">...</div>
) containing broken navigation - toc-backref anchor links from headings while preserving heading text Both removals are necessary since TOC anchor targets don't exist in summary context. The function is called automatically in Content.get_summary() so all summaries are cleaned without requiring configuration or template changes. Includes comprehensive unit tests covering various TOC formats, edge cases, and case-insensitive matching. --------- Co-authored-by: Justin Mayer --- pelican/contents.py | 19 ++++++++---- pelican/tests/test_contents.py | 22 ++++++++++++++ pelican/tests/test_utils.py | 54 ++++++++++++++++++++++++++++++++++ pelican/utils.py | 31 +++++++++++++++++++ 4 files changed, 120 insertions(+), 6 deletions(-) diff --git a/pelican/contents.py b/pelican/contents.py index 3d9e086c..c0a3e775 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -27,6 +27,7 @@ from pelican.utils import ( sanitised_join, set_date_tzinfo, slugify, + strip_toc_elements_from_html, truncate_html_paragraphs, truncate_html_words, ) @@ -446,13 +447,19 @@ class Content: content = truncate_html_paragraphs(self.content, max_paragraphs) if self.settings["SUMMARY_MAX_LENGTH"] is None: - return content + summary = content + else: + summary = truncate_html_words( + content, + self.settings["SUMMARY_MAX_LENGTH"], + self.settings["SUMMARY_END_SUFFIX"], + ) - return truncate_html_words( - content, - self.settings["SUMMARY_MAX_LENGTH"], - self.settings["SUMMARY_END_SUFFIX"], - ) + # Strip TOC elements that would contain broken links in summary context + # TOC anchors only work in full article view, not in summaries/excerpts + summary = strip_toc_elements_from_html(summary) + + return summary @property def summary(self) -> str: diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 4b1effa2..4e48e594 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -172,6 +172,28 @@ class TestPage(TestBase): ) self.assertIn("test_marker", 
page.summary) + def test_summary_strips_toc_elements(self): + """Auto-generated summary should strip TOC divs and toc-backref anchors.""" + page_kwargs = self._copy_page_kwargs() + settings = get_settings() + page_kwargs["settings"] = settings + del page_kwargs["metadata"]["summary"] + toc_content = ( + '
' + "

Table of contents

" + '' + "
" + '

My Section

' + "

First paragraph of real content.

" + ) + page_kwargs["content"] = toc_content + settings["SUMMARY_MAX_LENGTH"] = None + page = Page(**page_kwargs) + self.assertNotIn('
My Section", page.summary) + self.assertIn("

First paragraph of real content.

", page.summary) + def test_summary_get_summary_warning(self): """calling ._get_summary() should issue a warning""" page_kwargs = self._copy_page_kwargs() diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index b5a53eac..aad07555 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -425,6 +425,60 @@ class TestUtils(LoggedTestCase): self.assertEqual(utils.truncate_html_paragraphs(three, 3), three) self.assertEqual(utils.truncate_html_paragraphs(three, 4), three) + def test_strip_toc_elements_from_html(self): + # Test removing TOC div with various class names + html_with_toc = ( + '
' + '

Table of Contents

' + '' + "
" + "

Some content here

" + ) + result = utils.strip_toc_elements_from_html(html_with_toc) + self.assertNotIn('
Some content here

", result) + + # Test removing toc-backref anchors while preserving heading text + html_with_backref = ( + '

Section Heading

' + "

Some content

" + ) + result = utils.strip_toc_elements_from_html(html_with_backref) + self.assertNotIn("toc-backref", result) + self.assertNotIn("Section Heading", result) + + # Test combined - remove both TOC div and backrefs + html_combined = ( + '
' + "

TOC here

" + "
" + '

the design

' + "

Article content

" + '

key features

' + "

More content

" + ) + result = utils.strip_toc_elements_from_html(html_combined) + self.assertNotIn('
Article content

", result) + + # Test empty input + self.assertEqual(utils.strip_toc_elements_from_html(""), "") + + # Test HTML without TOC elements (should be unchanged) + plain_html = "

Just some plain content

" + self.assertEqual(utils.strip_toc_elements_from_html(plain_html), plain_html) + + # Test case-insensitive matching + html_mixed_case = '

TOC

Content

' + result = utils.strip_toc_elements_from_html(html_mixed_case) + self.assertNotIn("CONTENTS", result) + self.assertIn("

Content

", result) + def test_process_translations(self): fr_articles = [] en_articles = [] diff --git a/pelican/utils.py b/pelican/utils.py index c17422aa..ee14a031 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -649,6 +649,37 @@ def truncate_html_paragraphs(s, count): return "".join(paragraphs) +def strip_toc_elements_from_html(html: str) -> str: + """Strip table of contents elements from HTML summaries. + + Removes TOC divs (with broken navigation links) and toc-backref anchor + links from headings. Both are necessary since TOC anchor targets don't + exist when summaries are displayed outside full article context + (e.g., homepage, RSS feeds). + + :param html: HTML content to process + :return: Cleaned HTML with TOC elements removed + """ + # Remove the entire
<div class="contents">...</div>
block + html = re.sub( + r']*>.*?
', + "", + html, + flags=re.DOTALL | re.IGNORECASE, + ) + + # Remove anchor links from headings (e.g., text) + # These links point to anchors that don't exist in summary context + html = re.sub( + r']*class="[^"]*toc-backref[^"]*"[^>]*>(.*?)', + r"\1", + html, + flags=re.DOTALL | re.IGNORECASE, + ) + + return html + + def process_translations( content_list: list[Content], translation_id: str | Collection[str] | None = None,