From 513abbfdc668946590194c637dbe90ec228aaf6f Mon Sep 17 00:00:00 2001 From: Agathe Date: Tue, 25 Jun 2024 15:07:41 +0200 Subject: [PATCH] Introduce paragraph count summary (#2761) Co-authored-by: Justin Mayer --- docs/content.rst | 5 ++++- docs/settings.rst | 8 ++++++++ pelican/contents.py | 8 +++++++- pelican/tests/test_contents.py | 25 +++++++++++++++++++++++++ pelican/tests/test_utils.py | 17 +++++++++++++++++ pelican/utils.py | 19 +++++++++++++++++++ 6 files changed, 80 insertions(+), 2 deletions(-) diff --git a/docs/content.rst b/docs/content.rst index 4277b838..7d7e2cfa 100644 --- a/docs/content.rst +++ b/docs/content.rst @@ -162,7 +162,10 @@ author you can use ``author`` field. If you do not explicitly specify summary metadata for a given post, the ``SUMMARY_MAX_LENGTH`` setting can be used to specify how many words from the -beginning of an article are used as the summary. +beginning of an article are used as the summary. You can also use an article's +first N paragraphs as its summary using the ``SUMMARY_MAX_PARAGRAPHS`` setting. +If both settings are in use, the specified number of paragraphs will +be used but may be truncated to respect the specified maximum length. You can also extract any metadata from the filename through a regular expression to be set in the ``FILENAME_METADATA`` setting. All named groups diff --git a/docs/settings.rst b/docs/settings.rst index 7269c0bd..93c632d2 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -308,6 +308,14 @@ Basic settings does not otherwise specify a summary. Setting to ``None`` will cause the summary to be a copy of the original content. +.. data:: SUMMARY_MAX_PARAGRAPHS = None + + When creating a short summary of an article, this will be the number of + paragraphs to use as the summary. This only applies if your content + does not otherwise specify a summary. Setting to ``None`` will cause the + summary to use the whole text (up to ``SUMMARY_MAX_LENGTH``) instead of just + the first N paragraphs. 
+ .. data:: SUMMARY_END_SUFFIX = '…' When creating a short summary of an article and the result was truncated to diff --git a/pelican/contents.py b/pelican/contents.py index cf13dabc..5a403261 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -28,6 +28,7 @@ from pelican.utils import ( sanitised_join, set_date_tzinfo, slugify, + truncate_html_paragraphs, truncate_html_words, ) @@ -440,8 +441,13 @@ class Content: if "summary" in self.metadata: return self.metadata["summary"] + content = self.content + max_paragraphs = self.settings.get("SUMMARY_MAX_PARAGRAPHS") + if max_paragraphs is not None: + content = truncate_html_paragraphs(self.content, max_paragraphs) + if self.settings["SUMMARY_MAX_LENGTH"] is None: - return self.content + return content return truncate_html_words( self.content, diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 81f8907c..06d1a690 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -116,6 +116,31 @@ class TestPage(TestBase): page = Page(**page_kwargs) self.assertEqual(page.summary, "") + def test_summary_paragraph(self): + # If SUMMARY_MAX_PARAGRAPHS is set, the generated summary should + # not exceed the given paragraph count. + page_kwargs = self._copy_page_kwargs() + settings = get_settings() + page_kwargs["settings"] = settings + del page_kwargs["metadata"]["summary"] + settings["SUMMARY_MAX_PARAGRAPHS"] = 1 + settings["SUMMARY_MAX_LENGTH"] = None + page = Page(**page_kwargs) + self.assertEqual(page.summary, TEST_CONTENT) + + def test_summary_paragraph_max_length(self): + # If both SUMMARY_MAX_PARAGRAPHS and SUMMARY_MAX_LENGTH are set, + # the generated summary should not exceed the given paragraph count and + # not exceed the given length. 
+ page_kwargs = self._copy_page_kwargs() + settings = get_settings() + page_kwargs["settings"] = settings + del page_kwargs["metadata"]["summary"] + settings["SUMMARY_MAX_PARAGRAPHS"] = 1 + settings["SUMMARY_MAX_LENGTH"] = 10 + page = Page(**page_kwargs) + self.assertEqual(page.summary, truncate_html_words(TEST_CONTENT, 10)) + def test_summary_end_suffix(self): # If a :SUMMARY_END_SUFFIX: is set, and there is no other summary, # generated summary should contain the specified marker at the end. diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 0da59dd4..c35b756c 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -401,6 +401,23 @@ class TestUtils(LoggedTestCase): self.assertEqual(utils.truncate_html_words("Ӓ text", 20), "Ӓ text") self.assertEqual(utils.truncate_html_words("઼ text", 20), "઼ text") + def test_truncate_html_paragraphs(self): + one = "

<p>one</p>

" + + self.assertEqual(utils.truncate_html_paragraphs(one, 0), "") + self.assertEqual(utils.truncate_html_paragraphs(one, 1), one) + self.assertEqual(utils.truncate_html_paragraphs(one, 2), one) + + two = one + "

<p>two</p>

" + self.assertEqual(utils.truncate_html_paragraphs(two, 1), one) + self.assertEqual(utils.truncate_html_paragraphs(two, 2), two) + + three = two + "

<p>three</p>

" + self.assertEqual(utils.truncate_html_paragraphs(three, 1), one) + self.assertEqual(utils.truncate_html_paragraphs(three, 2), two) + self.assertEqual(utils.truncate_html_paragraphs(three, 3), three) + self.assertEqual(utils.truncate_html_paragraphs(three, 4), three) + def test_process_translations(self): fr_articles = [] en_articles = [] diff --git a/pelican/utils.py b/pelican/utils.py index b780ab97..69d9dde5 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -631,6 +631,25 @@ def truncate_html_words(s: str, num: int, end_text: str = "…") -> str: return out +def truncate_html_paragraphs(s, count): + """Truncate HTML to a certain number of paragraphs. + + :param count: number of paragraphs to keep + + Newlines in the HTML are preserved. + """ + paragraphs = [] + tag_stop = 0 + substr = s[:] + for _ in range(count): + substr = substr[tag_stop:] + tag_start = substr.find("

") + tag_stop = substr.find("

") + len("

") + paragraphs.append(substr[tag_start:tag_stop]) + + return "".join(paragraphs) + + def process_translations( content_list: list[Content], translation_id: str | Collection[str] | None = None,