From d6461130cc9e2f0b85fe8a0cf7a5904f0ca32545 Mon Sep 17 00:00:00 2001
From: Russell <russell.ballestrini@gmail.com>
Date: Mon, 13 Apr 2026 14:01:25 -0400
Subject: [PATCH] Strip TOC elements from article summaries

* Strip TOC elements from article summaries

Automatically remove table of contents divs and toc-backref anchor links
from article summaries when they are displayed outside the full article
context (e.g., on the homepage or in RSS feeds).

ReStructuredText automatically generates anchor links in section headings
when a table of contents directive is present. These anchors work perfectly
on full article pages, but become broken links when article summaries appear
on the homepage or in feeds, because the anchor targets don't exist there.

This change adds a strip_toc_elements_from_html() function in pelican/utils.py
that uses regex to remove:
- TOC div blocks (<div class="contents">...</div>) containing broken navigation
- toc-backref anchor links from headings while preserving heading text

Both removals are necessary since TOC anchor targets don't exist in summary context.

The function is called automatically in Content.get_summary() so all
summaries are cleaned without requiring configuration or template changes.

Includes comprehensive unit tests covering various TOC formats, edge cases,
and case-insensitive matching.

---------

Co-authored-by: Justin Mayer <entroP@gmail.com>
---
 pelican/contents.py            | 19 ++++++++----
 pelican/tests/test_contents.py | 22 ++++++++++++++
 pelican/tests/test_utils.py    | 54 ++++++++++++++++++++++++++++++++++
 pelican/utils.py               | 31 +++++++++++++++++++
 4 files changed, 120 insertions(+), 6 deletions(-)
diff --git a/pelican/contents.py b/pelican/contents.py
index 3d9e086c..c0a3e775 100644
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -27,6 +27,7 @@ from pelican.utils import (
     sanitised_join,
     set_date_tzinfo,
     slugify,
+    strip_toc_elements_from_html,
     truncate_html_paragraphs,
     truncate_html_words,
 )
@@ -446,13 +447,19 @@ class Content:
             content = truncate_html_paragraphs(self.content, max_paragraphs)
 
         if self.settings["SUMMARY_MAX_LENGTH"] is None:
-            return content
+            summary = content
+        else:
+            summary = truncate_html_words(
+                content,
+                self.settings["SUMMARY_MAX_LENGTH"],
+                self.settings["SUMMARY_END_SUFFIX"],
+            )
 
-        return truncate_html_words(
-            content,
-            self.settings["SUMMARY_MAX_LENGTH"],
-            self.settings["SUMMARY_END_SUFFIX"],
-        )
+        # Strip TOC elements that would contain broken links in summary context
+        # TOC anchors only work in full article view, not in summaries/excerpts
+        summary = strip_toc_elements_from_html(summary)
+
+        return summary
 
     @property
     def summary(self) -> str:
diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py
index 4b1effa2..4e48e594 100644
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@@ -172,6 +172,28 @@ class TestPage(TestBase):
         )
         self.assertIn("test_marker", page.summary)
 
+    def test_summary_strips_toc_elements(self):
+        """A generated summary must carry no TOC div or toc-backref anchor."""
+        kwargs = self._copy_page_kwargs()
+        kwargs["settings"] = get_settings()
+        kwargs["settings"]["SUMMARY_MAX_LENGTH"] = None
+        # Drop the metadata summary so the auto-generated one is what is tested.
+        del kwargs["metadata"]["summary"]
+        toc_block = (
+            '<div class="contents topic" id="toc">'
+            "<p>Table of contents</p>"
+            '<ul><li><a href="#s1">Section</a></li></ul>'
+            "</div>"
+        )
+        heading = '<h2><a class="toc-backref" href="#id1">My Section</a></h2>'
+        body = "<p>First paragraph of real content.</p>"
+        kwargs["content"] = toc_block + heading + body
+        page = Page(**kwargs)
+        self.assertNotIn('<div class="contents', page.summary)
+        self.assertNotIn("toc-backref", page.summary)
+        self.assertIn("<h2>My Section</h2>", page.summary)
+        self.assertIn(body, page.summary)
+
     def test_summary_get_summary_warning(self):
         """calling ._get_summary() should issue a warning"""
         page_kwargs = self._copy_page_kwargs()
diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py
index b5a53eac..aad07555 100644
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -425,6 +425,60 @@ class TestUtils(LoggedTestCase):
         self.assertEqual(utils.truncate_html_paragraphs(three, 3), three)
         self.assertEqual(utils.truncate_html_paragraphs(three, 4), three)
 
+    def test_strip_toc_elements_from_html(self):
+        """TOC divs are dropped and toc-backref anchors are unwrapped."""
+        strip = utils.strip_toc_elements_from_html
+
+        # A TOC div carrying extra classes and an id is removed entirely
+        toc_and_text = (
+            '<div class="contents topic" id="table-of-contents">'
+            '<p class="topic-title">Table of Contents</p>'
+            '<ul><li><a href="#section1">Section 1</a></li></ul>'
+            "</div>"
+            "<p>Some content here</p>"
+        )
+        cleaned = strip(toc_and_text)
+        self.assertNotIn('<div class="contents', cleaned)
+        self.assertIn("<p>Some content here</p>", cleaned)
+
+        # A toc-backref anchor is unwrapped; the heading keeps its text
+        heading = '<h2><a class="toc-backref" href="#id1">Section Heading</a></h2>'
+        cleaned = strip(heading + "<p>Some content</p>")
+        self.assertNotIn("toc-backref", cleaned)
+        self.assertNotIn("<a class=", cleaned)
+        self.assertIn("Section Heading", cleaned)
+        self.assertIn("<h2>Section Heading</h2>", cleaned)
+
+        # A TOC div and two backrefs in the same document
+        toc_and_backrefs = (
+            '<div class="contents">'
+            "<p>TOC here</p>"
+            "</div>"
+            '<h2><a class="toc-backref" href="#id1">the design</a></h2>'
+            "<p>Article content</p>"
+            '<h2><a class="toc-backref" href="#id2">key features</a></h2>'
+            "<p>More content</p>"
+        )
+        cleaned = strip(toc_and_backrefs)
+        self.assertNotIn('<div class="contents', cleaned)
+        self.assertNotIn("toc-backref", cleaned)
+        self.assertIn("the design", cleaned)
+        self.assertIn("key features", cleaned)
+        self.assertIn("<p>Article content</p>", cleaned)
+
+        # Empty input stays empty
+        self.assertEqual(strip(""), "")
+
+        # HTML without TOC elements comes back unchanged
+        plain = "<p>Just some plain content</p>"
+        self.assertEqual(strip(plain), plain)
+
+        # Upper-case attribute name and class value are matched as well
+        upper_toc = '<div CLASS="CONTENTS"><p>TOC</p></div><p>Content</p>'
+        cleaned = strip(upper_toc)
+        self.assertNotIn("CONTENTS", cleaned)
+        self.assertIn("<p>Content</p>", cleaned)
+
     def test_process_translations(self):
         fr_articles = []
         en_articles = []
diff --git a/pelican/utils.py b/pelican/utils.py
index c17422aa..ee14a031 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -649,6 +649,37 @@ def truncate_html_paragraphs(s, count):
     return "".join(paragraphs)
 
 
+def strip_toc_elements_from_html(html: str) -> str:
+    """Strip table of contents elements from an HTML summary.
+
+    Deletes each ``<div>`` whose class list contains ``contents`` and unwraps
+    each ``toc-backref`` anchor, keeping its inner markup. Both hold links
+    whose targets don't exist when a summary is displayed outside the full
+    article (e.g., homepage, RSS feeds). Class names are matched as whole
+    tokens in any attribute position, so ``class="contents-list"`` is kept.
+    The div match ends at the first ``</div>``; nested divs are not handled.
+
+    :param html: HTML content to process
+    :return: Cleaned HTML with TOC elements removed
+    """
+    flags = re.DOTALL | re.IGNORECASE
+
+    # class="..." holding %s as one whitespace-separated token; the leading \s
+    # stops look-alike attributes such as data-class from matching.
+    has_class = r'\sclass="(?:[^"]*\s)?%s(?:\s[^"]*)?"'
+    toc_div = r"<div\b[^>]*?" + has_class % "contents" + r"[^>]*>.*?</div>"
+    backref = r"<a\b[^>]*?" + has_class % "toc-backref" + r"[^>]*>(.*?)</a>"
+
+    # Remove the entire TOC block, e.g. <div class="contents topic">...</div>
+    html = re.sub(toc_div, "", html, flags=flags)
+
+    # Replace <a class="toc-backref" href="#id1">text</a> with just "text";
+    # the href points to an anchor that doesn't exist in summary context.
+    html = re.sub(backref, r"\1", html, flags=flags)
+
+    return html
+
+
 def process_translations(
     content_list: list[Content],
     translation_id: str | Collection[str] | None = None,