From 7466b13e0a6c2f0a19ee7b8640adfbd8a7a8ec9e Mon Sep 17 00:00:00 2001 From: Salar Nosrati-Ershad Date: Wed, 22 Nov 2023 22:54:30 +0330 Subject: [PATCH 01/22] fix: keep newline at the end of the file in tools As referenced in Jinja documentation about whitespace control: > To keep single trailing newlines, configure Jinja to > `keep_trailing_newline` I added this to our Jinja environment to keep EOL new line in tools scripts --- RELEASE.md | 3 +++ pelican/tools/pelican_quickstart.py | 1 + 2 files changed, 4 insertions(+) create mode 100644 RELEASE.md diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 00000000..7881aeac --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,3 @@ +Release type: patch + +Keep the newline at the end of the file in generating tools scripts diff --git a/pelican/tools/pelican_quickstart.py b/pelican/tools/pelican_quickstart.py index db00ce70..a4dc98e1 100755 --- a/pelican/tools/pelican_quickstart.py +++ b/pelican/tools/pelican_quickstart.py @@ -44,6 +44,7 @@ _TEMPLATES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "templ _jinja_env = Environment( loader=FileSystemLoader(_TEMPLATES_DIR), trim_blocks=True, + keep_trailing_newline=True, ) From 4ed5c0d5b87e7711e779be6a26c4a1d9ad21aeaa Mon Sep 17 00:00:00 2001 From: MinchinWeb Date: Sat, 25 Nov 2023 20:57:40 -0700 Subject: [PATCH 02/22] Log the original calling location, rather than the wrapper function --- pelican/log.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pelican/log.py b/pelican/log.py index 0d2b6a3f..befecbf1 100644 --- a/pelican/log.py +++ b/pelican/log.py @@ -85,13 +85,15 @@ class FatalLogger(LimitLogger): warnings_fatal = False errors_fatal = False + # adding `stacklevel=2` means that the displayed filename and line number + # will match the "original" calling location, rather than the wrapper here def warning(self, *args, **kwargs): - super().warning(*args, **kwargs) + super().warning(*args, stacklevel=2, **kwargs) if 
FatalLogger.warnings_fatal: raise RuntimeError("Warning encountered") def error(self, *args, **kwargs): - super().error(*args, **kwargs) + super().error(*args, stacklevel=2, **kwargs) if FatalLogger.errors_fatal: raise RuntimeError("Error encountered") From 8626d5bd85da049e7ca7828a785d08e02b736aa1 Mon Sep 17 00:00:00 2001 From: Raphael Das Gupta Date: Fri, 22 Dec 2023 15:56:57 +0100 Subject: [PATCH 03/22] docs: update URL to AsciiDoc website https://www.methods.co.nz/asciidoc/ gives a SSL certificate warning and a 404 (page not found) error. https://asciidoc.org is the new official website for the AsciiDoc file format. (It's also what https://en.wikipedia.org/wiki/AsciiDoc links to.) --- docs/content.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content.rst b/docs/content.rst index cacacea9..8a5d9b32 100644 --- a/docs/content.rst +++ b/docs/content.rst @@ -631,7 +631,7 @@ are not included by default in tag, category, and author indexes, nor in the main article feed. This has the effect of creating an "unlisted" post. .. _W3C ISO 8601: https://www.w3.org/TR/NOTE-datetime -.. _AsciiDoc: https://www.methods.co.nz/asciidoc/ +.. _AsciiDoc: https://asciidoc.org .. _Pelican Plugins: https://github.com/pelican-plugins .. _pelican-plugins: https://github.com/getpelican/pelican-plugins .. 
_Python-Markdown: https://github.com/Python-Markdown/markdown From f0beb81a973f44ed1c8704984bc325b5f4df095c Mon Sep 17 00:00:00 2001 From: MinchinWeb Date: Sun, 14 Jan 2024 13:45:51 -0700 Subject: [PATCH 04/22] Better error logging if a plugin refuses to load --- pelican/__init__.py | 3 ++- pelican/log.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pelican/__init__.py b/pelican/__init__.py index a25f5624..40251887 100644 --- a/pelican/__init__.py +++ b/pelican/__init__.py @@ -80,7 +80,8 @@ class Pelican: plugin.register() self.plugins.append(plugin) except Exception as e: - logger.error("Cannot register plugin `%s`\n%s", name, e) + logger.error("Cannot register plugin `%s`\n%s", name, e, stacklevel=3) + print(e.stacktrace) self.settings["PLUGINS"] = [get_plugin_name(p) for p in self.plugins] diff --git a/pelican/log.py b/pelican/log.py index befecbf1..6a8fcdf1 100644 --- a/pelican/log.py +++ b/pelican/log.py @@ -88,12 +88,16 @@ class FatalLogger(LimitLogger): # adding `stacklevel=2` means that the displayed filename and line number # will match the "original" calling location, rather than the wrapper here def warning(self, *args, **kwargs): - super().warning(*args, stacklevel=2, **kwargs) + if "stacklevel" not in kwargs.keys(): + kwargs["stacklevel"] = 2 + super().warning(*args, **kwargs) if FatalLogger.warnings_fatal: raise RuntimeError("Warning encountered") def error(self, *args, **kwargs): - super().error(*args, stacklevel=2, **kwargs) + if "stacklevel" not in kwargs.keys(): + kwargs["stacklevel"] = 2 + super().error(*args, **kwargs) if FatalLogger.errors_fatal: raise RuntimeError("Error encountered") From f69e2cca6b5d26c8a6b2f3f4444a2c3de2e2d202 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ricks?= Date: Sun, 17 Dec 2023 13:56:33 +0100 Subject: [PATCH 05/22] Add type hints for settings module Types make it easier to understand the code and improve autocompletion in IDEs. 
--- pelican/settings.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/pelican/settings.py b/pelican/settings.py index 33ec210a..29051ddb 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -8,11 +8,13 @@ import re import sys from os.path import isabs from pathlib import Path +from types import ModuleType +from typing import Any, Dict, Optional from pelican.log import LimitFilter -def load_source(name, path): +def load_source(name: str, path: str) -> ModuleType: spec = importlib.util.spec_from_file_location(name, path) mod = importlib.util.module_from_spec(spec) sys.modules[name] = mod @@ -22,6 +24,8 @@ def load_source(name, path): logger = logging.getLogger(__name__) +Settings = Dict[str, Any] + DEFAULT_THEME = os.path.join( os.path.dirname(os.path.abspath(__file__)), "themes", "notmyidea" ) @@ -177,7 +181,9 @@ DEFAULT_CONFIG = { PYGMENTS_RST_OPTIONS = None -def read_settings(path=None, override=None): +def read_settings( + path: Optional[str] = None, override: Optional[Settings] = None +) -> Settings: settings = override or {} if path: @@ -221,7 +227,7 @@ def read_settings(path=None, override=None): return settings -def get_settings_from_module(module=None): +def get_settings_from_module(module: Optional[ModuleType] = None) -> Settings: """Loads settings from a module, returns a dictionary.""" context = {} @@ -230,7 +236,7 @@ def get_settings_from_module(module=None): return context -def get_settings_from_file(path): +def get_settings_from_file(path: str) -> Settings: """Loads settings from a file path, returning a dict.""" name, ext = os.path.splitext(os.path.basename(path)) @@ -238,7 +244,7 @@ def get_settings_from_file(path): return get_settings_from_module(module) -def get_jinja_environment(settings): +def get_jinja_environment(settings: Settings) -> Settings: """Sets the environment for Jinja""" jinja_env = settings.setdefault( @@ -253,7 +259,7 @@ def get_jinja_environment(settings): return settings -def 
_printf_s_to_format_field(printf_string, format_field): +def _printf_s_to_format_field(printf_string: str, format_field: str) -> str: """Tries to replace %s with {format_field} in the provided printf_string. Raises ValueError in case of failure. """ @@ -269,7 +275,7 @@ def _printf_s_to_format_field(printf_string, format_field): return result -def handle_deprecated_settings(settings): +def handle_deprecated_settings(settings: Settings) -> Settings: """Converts deprecated settings and issues warnings. Issues an exception if both old and new setting is specified. """ @@ -566,7 +572,7 @@ def handle_deprecated_settings(settings): return settings -def configure_settings(settings): +def configure_settings(settings: Settings) -> Settings: """Provide optimizations, error checking, and warnings for the given settings. Also, specify the log messages to be ignored. From bf4fd679a5322433cc4313a80cac49d4ed6c348f Mon Sep 17 00:00:00 2001 From: boxydog <93335439+boxydog@users.noreply.github.com> Date: Mon, 15 Jan 2024 03:43:19 -0600 Subject: [PATCH 06/22] Document how to import posts from Medium (#3262) --- docs/importer.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/importer.rst b/docs/importer.rst index 997a4632..08092984 100644 --- a/docs/importer.rst +++ b/docs/importer.rst @@ -26,6 +26,12 @@ not be converted (as Pelican also supports Markdown). manually, or use a plugin such as `More Categories`_ that enables multiple categories per article. +.. note:: + + Imported pages may contain links to images that still point to the original site. + So you might want to download those images into your local content and manually + re-link them from the relevant pages of your site. + Dependencies ============ @@ -121,6 +127,15 @@ For WordPress:: $ pelican-import --wpfile -o ~/output ~/posts.xml +For Medium (an example of using an RSS feed):: + + $ python -m pip install feedparser + $ pelican-import --feed https://medium.com/feed/@username + +.. 
note:: + + The RSS feed may only return the most recent posts — not all of them. + Tests ===== From 5e6dba73acfd6a85560d82870d1cda9d184c3cb5 Mon Sep 17 00:00:00 2001 From: Salar Nosrati-Ershad Date: Mon, 15 Jan 2024 13:33:54 +0330 Subject: [PATCH 07/22] Add Github Pages commit message variable (#3250) --- pelican/tools/templates/Makefile.jinja2 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pelican/tools/templates/Makefile.jinja2 b/pelican/tools/templates/Makefile.jinja2 index 93ab1aa7..1e9dbff5 100644 --- a/pelican/tools/templates/Makefile.jinja2 +++ b/pelican/tools/templates/Makefile.jinja2 @@ -37,6 +37,7 @@ DROPBOX_DIR={{dropbox_dir}} {% endif %} {% if github %} GITHUB_PAGES_BRANCH={{github_pages_branch}} +GITHUB_PAGES_COMMIT_MESSAGE=Generate Pelican site {% endif %} @@ -161,7 +162,7 @@ cf_upload: publish {% if github %} {% set upload = upload + ["github"] %} github: publish - ghp-import -m "Generate Pelican site" -b $(GITHUB_PAGES_BRANCH) "$(OUTPUTDIR)" + ghp-import -m "$(GITHUB_PAGES_COMMIT_MESSAGE)" -b $(GITHUB_PAGES_BRANCH) "$(OUTPUTDIR)" git push origin $(GITHUB_PAGES_BRANCH) {% endif %} From b1cb6c7326e32afba373113b86d823d46f94a812 Mon Sep 17 00:00:00 2001 From: Salar Nosrati-Ershad Date: Mon, 15 Jan 2024 13:40:12 +0330 Subject: [PATCH 08/22] Use `--no-jekyll` flag when invoking `ghp-import` (#3259) --- pelican/tools/templates/Makefile.jinja2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pelican/tools/templates/Makefile.jinja2 b/pelican/tools/templates/Makefile.jinja2 index 1e9dbff5..67571b47 100644 --- a/pelican/tools/templates/Makefile.jinja2 +++ b/pelican/tools/templates/Makefile.jinja2 @@ -162,7 +162,7 @@ cf_upload: publish {% if github %} {% set upload = upload + ["github"] %} github: publish - ghp-import -m "$(GITHUB_PAGES_COMMIT_MESSAGE)" -b $(GITHUB_PAGES_BRANCH) "$(OUTPUTDIR)" + ghp-import -m "$(GITHUB_PAGES_COMMIT_MESSAGE)" -b $(GITHUB_PAGES_BRANCH) "$(OUTPUTDIR)" --no-jekyll git push origin 
$(GITHUB_PAGES_BRANCH) {% endif %} From d6a33f1d21a8cbb34b584895554147ad97e97a72 Mon Sep 17 00:00:00 2001 From: boxydog Date: Fri, 1 Dec 2023 11:27:16 -0600 Subject: [PATCH 09/22] Medium post importer (from medium export) --- docs/content.rst | 4 +- docs/importer.rst | 13 +- pelican/tests/content/medium_post_content.txt | 4 + ...2017-04-21_-medium-post--d1bf01d62ba3.html | 72 ++++++++ pelican/tests/test_generators.py | 37 +++- pelican/tests/test_importer.py | 83 +++++++++ pelican/tools/pelican_import.py | 165 +++++++++++++++++- 7 files changed, 357 insertions(+), 21 deletions(-) create mode 100644 pelican/tests/content/medium_post_content.txt create mode 100644 pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html diff --git a/docs/content.rst b/docs/content.rst index cacacea9..46db1140 100644 --- a/docs/content.rst +++ b/docs/content.rst @@ -439,8 +439,8 @@ For **Markdown**, one must rely on an extension. For example, using the `mdx_inc Importing an existing site ========================== -It is possible to import your site from WordPress, Tumblr, Dotclear, and RSS -feeds using a simple script. See :ref:`import`. +It is possible to import your site from several other blogging sites +(such as WordPress, Tumblr, etc.) using a simple script. See :ref:`import`. Translations ============ diff --git a/docs/importer.rst b/docs/importer.rst index 997a4632..093ef465 100644 --- a/docs/importer.rst +++ b/docs/importer.rst @@ -11,6 +11,7 @@ software to reStructuredText or Markdown. 
The supported import formats are: - Blogger XML export - Dotclear export +- Medium export - Tumblr API - WordPress XML export - RSS/Atom feed @@ -65,6 +66,7 @@ Optional arguments -h, --help Show this help message and exit --blogger Blogger XML export (default: False) --dotclear Dotclear export (default: False) + --medium Medium export (default: False) --tumblr Tumblr API (default: False) --wpfile WordPress XML export (default: False) --feed Feed to parse (default: False) @@ -80,8 +82,7 @@ Optional arguments (default: False) --filter-author Import only post from the specified author --strip-raw Strip raw HTML code that can't be converted to markup - such as flash embeds or iframes (wordpress import - only) (default: False) + such as flash embeds or iframes (default: False) --wp-custpost Put wordpress custom post types in directories. If used with --dir-cat option directories will be created as "/post_type/category/" (wordpress import only) @@ -113,6 +114,14 @@ For Dotclear:: $ pelican-import --dotclear -o ~/output ~/backup.txt +For Medium:: + + $ pelican-import --medium -o ~/output ~/medium-export/posts/ + +The Medium export is a zip file. Unzip it, and point this tool to the +"posts" subdirectory. For more information on how to export, see +https://help.medium.com/hc/en-us/articles/115004745787-Export-your-account-data. + For Tumblr:: $ pelican-import --tumblr -o ~/output --blogname= diff --git a/pelican/tests/content/medium_post_content.txt b/pelican/tests/content/medium_post_content.txt new file mode 100644 index 00000000..5e21881c --- /dev/null +++ b/pelican/tests/content/medium_post_content.txt @@ -0,0 +1,4 @@ + +

Title header

A paragraph of content.

Paragraph number two.

A list:

  1. One.
  2. Two.
  3. Three.

A link: link text.

Header 2

A block quote:

quote words strong words

after blockquote

A figure caption.

A final note: Cross-Validated has sometimes been helpful.


Next: Next post +

+

By User Name on .

Canonical link

Exported from Medium on December 1, 2023.

diff --git a/pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html b/pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html new file mode 100644 index 00000000..02d272dc --- /dev/null +++ b/pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html @@ -0,0 +1,72 @@ +A title diff --git a/pelican/tests/test_generators.py b/pelican/tests/test_generators.py index af6f5b1a..8c257b55 100644 --- a/pelican/tests/test_generators.py +++ b/pelican/tests/test_generators.py @@ -264,6 +264,7 @@ class TestArticlesGenerator(unittest.TestCase): def test_generate_context(self): articles_expected = [ + ["A title", "published", "medium_posts", "article"], ["Article title", "published", "Default", "article"], [ "Article with markdown and summary metadata multi", @@ -391,13 +392,24 @@ class TestArticlesGenerator(unittest.TestCase): # terms of process order will define the name for that category categories = [cat.name for cat, _ in self.generator.categories] categories_alternatives = ( - sorted(["Default", "TestCategory", "Yeah", "test", "指導書"]), - sorted(["Default", "TestCategory", "yeah", "test", "指導書"]), + sorted( + ["Default", "TestCategory", "medium_posts", "Yeah", "test", "指導書"] + ), + sorted( + ["Default", "TestCategory", "medium_posts", "yeah", "test", "指導書"] + ), ) self.assertIn(sorted(categories), categories_alternatives) # test for slug categories = [cat.slug for cat, _ in self.generator.categories] - categories_expected = ["default", "testcategory", "yeah", "test", "zhi-dao-shu"] + categories_expected = [ + "default", + "testcategory", + "medium_posts", + "yeah", + "test", + "zhi-dao-shu", + ] self.assertEqual(sorted(categories), sorted(categories_expected)) def test_do_not_use_folder_as_category(self): @@ -549,7 +561,8 @@ class TestArticlesGenerator(unittest.TestCase): granularity: {period["period"] for period in periods} for granularity, periods in period_archives.items() } - expected = {"year": {(1970,), 
(2010,), (2012,), (2014,)}} + self.maxDiff = None + expected = {"year": {(1970,), (2010,), (2012,), (2014,), (2017,)}} self.assertEqual(expected, abbreviated_archives) # Month archives enabled: @@ -570,7 +583,7 @@ class TestArticlesGenerator(unittest.TestCase): for granularity, periods in period_archives.items() } expected = { - "year": {(1970,), (2010,), (2012,), (2014,)}, + "year": {(1970,), (2010,), (2012,), (2014,), (2017,)}, "month": { (1970, "January"), (2010, "December"), @@ -578,6 +591,7 @@ class TestArticlesGenerator(unittest.TestCase): (2012, "November"), (2012, "October"), (2014, "February"), + (2017, "April"), }, } self.assertEqual(expected, abbreviated_archives) @@ -602,7 +616,7 @@ class TestArticlesGenerator(unittest.TestCase): for granularity, periods in period_archives.items() } expected = { - "year": {(1970,), (2010,), (2012,), (2014,)}, + "year": {(1970,), (2010,), (2012,), (2014,), (2017,)}, "month": { (1970, "January"), (2010, "December"), @@ -610,6 +624,7 @@ class TestArticlesGenerator(unittest.TestCase): (2012, "November"), (2012, "October"), (2014, "February"), + (2017, "April"), }, "day": { (1970, "January", 1), @@ -619,6 +634,7 @@ class TestArticlesGenerator(unittest.TestCase): (2012, "October", 30), (2012, "October", 31), (2014, "February", 9), + (2017, "April", 21), }, } self.assertEqual(expected, abbreviated_archives) @@ -836,8 +852,12 @@ class TestArticlesGenerator(unittest.TestCase): categories = sorted([category.name for category, _ in generator.categories]) categories_expected = [ - sorted(["Default", "TestCategory", "yeah", "test", "指導書"]), - sorted(["Default", "TestCategory", "Yeah", "test", "指導書"]), + sorted( + ["Default", "TestCategory", "medium_posts", "yeah", "test", "指導書"] + ), + sorted( + ["Default", "TestCategory", "medium_posts", "Yeah", "test", "指導書"] + ), ] self.assertIn(categories, categories_expected) @@ -864,6 +884,7 @@ class TestArticlesGenerator(unittest.TestCase): generator.generate_context() expected = [ + "A 
title", "An Article With Code Block To Test Typogrify Ignore", "Article title", "Article with Nonconformant HTML meta tags", diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 05ef5bbd..916c1183 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -21,6 +21,10 @@ from pelican.tools.pelican_import import ( get_attachments, tumblr2fields, wp2fields, + mediumpost2fields, + mediumposts2fields, + strip_medium_post_content, + medium_slug, ) from pelican.utils import path_to_file_url, slugify @@ -708,3 +712,82 @@ class TestTumblrImporter(TestCaseWithCLocale): posts, posts, ) + + +class TestMediumImporter(TestCaseWithCLocale): + def setUp(self): + super().setUp() + self.test_content_root = "pelican/tests/content" + # The content coming out of parsing is similar, but not the same. + # Beautiful soup rearranges the order of attributes, for example. + # So, we keep a copy of the content for the test. + content_filename = f"{self.test_content_root}/medium_post_content.txt" + with open(content_filename, encoding="utf-8") as the_content_file: + # Many editors and scripts add a final newline, so live with that + # in our test + the_content = the_content_file.read() + assert the_content[-1] == "\n" + the_content = the_content[:-1] + self.post_tuple = ( + "A title", + the_content, + # slug: + "2017-04-21-medium-post", + "2017-04-21 17:11", + "User Name", + None, + (), + "published", + "article", + "html", + ) + + def test_mediumpost2field(self): + """Parse one post""" + post_filename = f"{self.test_content_root}/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html" + val = mediumpost2fields(post_filename) + self.assertEqual(self.post_tuple, val, val) + + def test_mediumposts2field(self): + """Parse all posts in an export directory""" + posts = [ + fields + for fields in mediumposts2fields(f"{self.test_content_root}/medium_posts") + ] + self.assertEqual(1, len(posts)) + self.assertEqual(self.post_tuple, posts[0]) + + 
def test_strip_content(self): + """Strip out unhelpful tags""" + html_doc = ( + "
This keeps lots of tags, but not " + "the
section
tags
" + ) + soup = BeautifulSoup(html_doc, "html.parser") + self.assertEqual( + "This keeps lots of tags, but not the section tags", + strip_medium_post_content(soup), + ) + + def test_medium_slug(self): + # Remove hex stuff at the end + self.assertEqual( + "2017-04-27_A-long-title", + medium_slug( + "medium-export/posts/2017-04-27_A-long-title--2971442227dd.html" + ), + ) + # Remove "--DRAFT" at the end + self.assertEqual( + "2017-04-27_A-long-title", + medium_slug("medium-export/posts/2017-04-27_A-long-title--DRAFT.html"), + ) + # Remove both (which happens) + self.assertEqual( + "draft_How-to-do", medium_slug("draft_How-to-do--DRAFT--87225c81dddd.html") + ) + # If no hex stuff, leave it alone + self.assertEqual( + "2017-04-27_A-long-title", + medium_slug("medium-export/posts/2017-04-27_A-long-title.html"), + ) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 681a5c45..eb343860 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -15,6 +15,8 @@ from urllib.error import URLError from urllib.parse import quote, urlparse, urlsplit, urlunsplit from urllib.request import urlretrieve +import dateutil.parser + # because logging.setLoggerClass has to be called before logging.getLogger from pelican.log import init from pelican.settings import DEFAULT_CONFIG @@ -114,19 +116,25 @@ def decode_wp_content(content, br=True): return content -def xml_to_soup(xml): - """Opens an xml file""" +def _import_bs4(): + """Import and return bs4, otherwise sys.exit.""" try: - from bs4 import BeautifulSoup + import bs4 except ImportError: error = ( 'Missing dependency "BeautifulSoup4" and "lxml" required to ' "import XML files." 
) sys.exit(error) + return bs4 + + +def file_to_soup(xml, features="xml"): + """Reads a file, returns soup.""" + bs4 = _import_bs4() with open(xml, encoding="utf-8") as infile: xmlfile = infile.read() - soup = BeautifulSoup(xmlfile, "xml") + soup = bs4.BeautifulSoup(xmlfile, features) return soup @@ -140,7 +148,7 @@ def get_filename(post_name, post_id): def wp2fields(xml, wp_custpost=False): """Opens a wordpress XML file, and yield Pelican fields""" - soup = xml_to_soup(xml) + soup = file_to_soup(xml) items = soup.rss.channel.findAll("item") for item in items: if item.find("status").string in ["publish", "draft"]: @@ -210,7 +218,7 @@ def wp2fields(xml, wp_custpost=False): def blogger2fields(xml): """Opens a blogger XML file, and yield Pelican fields""" - soup = xml_to_soup(xml) + soup = file_to_soup(xml) entries = soup.feed.findAll("entry") for entry in entries: raw_kind = entry.find( @@ -536,6 +544,133 @@ def tumblr2fields(api_key, blogname): posts = _get_tumblr_posts(api_key, blogname, offset) +def strip_medium_post_content(soup) -> str: + """Strip some tags and attributes from medium post content. + + For example, the 'section' and 'div' tags cause trouble while rendering. + + The problem with these tags is you can get a section divider (--------------) + that is not between two pieces of content. For example: + + Some text. + + .. container:: section-divider + + -------------- + + .. container:: section-content + + More content. + + In this case, pandoc complains: "Unexpected section title or transition." + + Also, the "id" and "name" attributes in tags cause similar problems. They show + up in .rst as extra junk that separates transitions. 
+ """ + # Remove tags + # section and div cause problems + # footer also can cause problems, and has nothing we want to keep + # See https://stackoverflow.com/a/8439761 + invalid_tags = ["section", "div", "footer"] + for tag in invalid_tags: + for match in soup.findAll(tag): + match.replaceWithChildren() + + # Remove attributes + # See https://stackoverflow.com/a/9045719 + invalid_attributes = ["name", "id", "class"] + bs4 = _import_bs4() + for tag in soup.descendants: + if isinstance(tag, bs4.element.Tag): + tag.attrs = { + key: value + for key, value in tag.attrs.items() + if key not in invalid_attributes + } + + # Get the string of all content, keeping other tags + all_content = "".join(str(element) for element in soup.contents) + return all_content + + +def mediumpost2fields(filepath: str) -> tuple: + """Take an HTML post from a medium export, return Pelican fields.""" + + soup = file_to_soup(filepath, "html.parser") + if not soup: + raise ValueError(f"{filepath} could not be parsed by beautifulsoup") + kind = "article" + + content = soup.find("section", class_="e-content") + if not content: + raise ValueError(f"{filepath}: Post has no content") + + title = soup.find("title").string or "" + + raw_date = soup.find("time", class_="dt-published") + date = None + if raw_date: + # This datetime can include timezone, e.g., "2017-04-21T17:11:55.799Z" + # python before 3.11 can't parse the timezone using datetime.fromisoformat + # See also https://docs.python.org/3.10/library/datetime.html#datetime.datetime.fromisoformat + # "This does not support parsing arbitrary ISO 8601 strings" + # So, we use dateutil.parser, which can handle it. 
+ date_object = dateutil.parser.parse(raw_date.attrs["datetime"]) + date = date_object.strftime("%Y-%m-%d %H:%M") + status = "published" + else: + status = "draft" + author = soup.find("a", class_="p-author h-card") + if author: + author = author.string + + # Now that we're done with classes, we can strip the content + content = strip_medium_post_content(content) + + # medium HTML export doesn't have tag or category + # RSS feed has tags, but it doesn't have all the posts. + tags = () + + slug = medium_slug(filepath) + + # TODO: make the fields a python dataclass + return ( + title, + content, + slug, + date, + author, + None, + tags, + status, + kind, + "html", + ) + + +def medium_slug(filepath: str) -> str: + """Make the filepath of a medium exported file into a slug.""" + # slug: filename without extension + slug = os.path.basename(filepath) + slug = os.path.splitext(slug)[0] + # A medium export filename looks like date_-title-...html + # But, RST doesn't like "_-" (see https://github.com/sphinx-doc/sphinx/issues/4350) + # so get rid of it + slug = slug.replace("_-", "-") + # drop the hex string medium puts on the end of the filename, why keep it. 
+ # e.g., "-a8a8a8a8" or "---a9a9a9a9" + # also: drafts don't need "--DRAFT" + slug = re.sub(r"((-)+([0-9a-f]+|DRAFT))+$", "", slug) + return slug + + +def mediumposts2fields(medium_export_dir: str): + """Take HTML posts in a medium export directory, and yield Pelican fields.""" + for file in os.listdir(medium_export_dir): + filename = os.fsdecode(file) + yield mediumpost2fields(os.path.join(medium_export_dir, filename)) + + def feed2fields(file): """Read a feed and yield pelican fields""" import feedparser @@ -711,7 +846,7 @@ def get_attachments(xml): """returns a dictionary of posts that have attachments with a list of the attachment_urls """ - soup = xml_to_soup(xml) + soup = file_to_soup(xml) items = soup.rss.channel.findAll("item") names = {} attachments = [] @@ -837,6 +972,9 @@ def fields2pelican( posts_require_pandoc.append(filename) slug = not disable_slugs and filename or None + assert slug is None or filename == os.path.basename( + filename + ), f"filename is not a basename: {filename}" if wp_attach and attachments: try: @@ -984,6 +1122,9 @@ def main(): parser.add_argument( "--dotclear", action="store_true", dest="dotclear", help="Dotclear export" ) + parser.add_argument( + "--medium", action="store_true", dest="medium", help="Medium export" + ) parser.add_argument( "--tumblr", action="store_true", dest="tumblr", help="Tumblr export" ) @@ -1069,6 +1210,8 @@ def main(): input_type = "blogger" elif args.dotclear: input_type = "dotclear" + elif args.medium: + input_type = "medium" elif args.tumblr: input_type = "tumblr" elif args.wpfile: @@ -1077,8 +1220,8 @@ def main(): input_type = "feed" else: error = ( - "You must provide either --blogger, --dotclear, " - "--tumblr, --wpfile or --feed options" + "You must provide one of --blogger, --dotclear, " + "--medium, --tumblr, --wpfile or --feed options" ) exit(error) @@ -1097,12 +1240,16 @@ def main(): fields = blogger2fields(args.input) elif input_type == "dotclear": fields = dc2fields(args.input) + elif 
input_type == "medium": + fields = mediumposts2fields(args.input) elif input_type == "tumblr": fields = tumblr2fields(args.input, args.blogname) elif input_type == "wordpress": fields = wp2fields(args.input, args.wp_custpost or False) elif input_type == "feed": fields = feed2fields(args.input) + else: + raise ValueError(f"Unhandled input_type {input_type}") if args.wp_attach: attachments = get_attachments(args.input) From fbe81a971a8f96eae6a13aee4471468f31cbf194 Mon Sep 17 00:00:00 2001 From: Justin Mayer Date: Wed, 17 Jan 2024 09:48:05 +0100 Subject: [PATCH 10/22] Delete RELEASE.md --- RELEASE.md | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 RELEASE.md diff --git a/RELEASE.md b/RELEASE.md deleted file mode 100644 index 7881aeac..00000000 --- a/RELEASE.md +++ /dev/null @@ -1,3 +0,0 @@ -Release type: patch - -Keep the newline at the end of the file in generating tools scripts From d39dd9b85f0309e4101e74a270fd2ce97f051a84 Mon Sep 17 00:00:00 2001 From: MinchinWeb Date: Sun, 21 Jan 2024 22:52:56 -0700 Subject: [PATCH 11/22] Resolve inter-site links in summaries. c.f. https://github.com/getpelican/pelican/issues/3265 c.f. 
https://github.com/MinchinWeb/minchin.pelican.plugins.summary/issues/5 --- pelican/__init__.py | 7 +++++-- pelican/contents.py | 17 +++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/pelican/__init__.py b/pelican/__init__.py index a25f5624..1a3090f8 100644 --- a/pelican/__init__.py +++ b/pelican/__init__.py @@ -120,12 +120,15 @@ class Pelican: if hasattr(p, "generate_context"): p.generate_context() + # for plugins that create/edit the summary + logger.debug("Signal all_generators_finalized.send()") + signals.all_generators_finalized.send(generators) + + # update links in the summary, etc for p in generators: if hasattr(p, "refresh_metadata_intersite_links"): p.refresh_metadata_intersite_links() - signals.all_generators_finalized.send(generators) - writer = self._get_writer() for p in generators: diff --git a/pelican/contents.py b/pelican/contents.py index 474e5bbf..27b8bbc3 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -520,12 +520,17 @@ class Content: # _summary is an internal variable that some plugins may be writing to, # so ensure changes to it are picked up - if ( - "summary" in self.settings["FORMATTED_FIELDS"] - and "summary" in self.metadata - ): - self._summary = self._update_content(self._summary, self.get_siteurl()) - self.metadata["summary"] = self._summary + if "summary" in self.settings["FORMATTED_FIELDS"]: + if hasattr(self, "_summary"): + self.metadata["summary"] = self._summary + + if "summary" in self.metadata: + self.metadata["summary"] = self._update_content( + self.metadata["summary"], self.get_siteurl() + ) + + if hasattr(self, "_summary") and "summary" in self.metadata: + self._summary = self.metadata["summary"] class Page(Content): From 2fa5c515b0232ce212a3d83827de88b01deaa598 Mon Sep 17 00:00:00 2001 From: namori <157323136+nam-ori@users.noreply.github.com> Date: Tue, 23 Jan 2024 09:43:07 +0100 Subject: [PATCH 12/22] Feeds - Update generators.py to fix a bug with slugs (#3279) --- 
pelican/generators.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/pelican/generators.py b/pelican/generators.py index 3b5ca9e4..076c8d38 100644 --- a/pelican/generators.py +++ b/pelican/generators.py @@ -384,8 +384,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["CATEGORY_FEED_ATOM"]).format(slug=cat.slug), self.settings.get( "CATEGORY_FEED_ATOM_URL", - str(self.settings["CATEGORY_FEED_ATOM"]).format(slug=cat.slug), - ), + str(self.settings["CATEGORY_FEED_ATOM"]), + ).format(slug=cat.slug), feed_title=cat.name, ) @@ -396,8 +396,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["CATEGORY_FEED_RSS"]).format(slug=cat.slug), self.settings.get( "CATEGORY_FEED_RSS_URL", - str(self.settings["CATEGORY_FEED_RSS"]).format(slug=cat.slug), - ), + str(self.settings["CATEGORY_FEED_RSS"]), + ).format(slug=cat.slug), feed_title=cat.name, feed_type="rss", ) @@ -410,8 +410,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["AUTHOR_FEED_ATOM"]).format(slug=auth.slug), self.settings.get( "AUTHOR_FEED_ATOM_URL", - str(self.settings["AUTHOR_FEED_ATOM"]).format(slug=auth.slug), - ), + str(self.settings["AUTHOR_FEED_ATOM"]), + ).format(slug=auth.slug), feed_title=auth.name, ) @@ -422,8 +422,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["AUTHOR_FEED_RSS"]).format(slug=auth.slug), self.settings.get( "AUTHOR_FEED_RSS_URL", - str(self.settings["AUTHOR_FEED_RSS"]).format(slug=auth.slug), - ), + str(self.settings["AUTHOR_FEED_RSS"]), + ).format(slug=auth.slug), feed_title=auth.name, feed_type="rss", ) @@ -437,8 +437,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["TAG_FEED_ATOM"]).format(slug=tag.slug), self.settings.get( "TAG_FEED_ATOM_URL", - str(self.settings["TAG_FEED_ATOM"]).format(slug=tag.slug), - ), + str(self.settings["TAG_FEED_ATOM"]), + ).format(slug=tag.slug), feed_title=tag.name, ) @@ -449,8 +449,8 @@ class 
ArticlesGenerator(CachingGenerator): str(self.settings["TAG_FEED_RSS"]).format(slug=tag.slug), self.settings.get( "TAG_FEED_RSS_URL", - str(self.settings["TAG_FEED_RSS"]).format(slug=tag.slug), - ), + str(self.settings["TAG_FEED_RSS"]), + ).format(slug=tag.slug), feed_title=tag.name, feed_type="rss", ) @@ -471,10 +471,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["TRANSLATION_FEED_ATOM"]).format(lang=lang), self.settings.get( "TRANSLATION_FEED_ATOM_URL", - str(self.settings["TRANSLATION_FEED_ATOM"]).format( - lang=lang - ), - ), + str(self.settings["TRANSLATION_FEED_ATOM"]), + ).format(lang=lang), ) if self.settings.get("TRANSLATION_FEED_RSS"): writer.write_feed( From 3a662ace031a20d15f4933c028b3fffd1b588430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ricks?= Date: Thu, 18 Jan 2024 17:17:29 +0100 Subject: [PATCH 13/22] Add type hints for contents module Types make it easier to understand the code and improve autocompletion in IDEs. --- pelican/contents.py | 95 +++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 42 deletions(-) diff --git a/pelican/contents.py b/pelican/contents.py index 474e5bbf..82be8f73 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -6,7 +6,8 @@ import os import re from datetime import timezone from html import unescape -from urllib.parse import unquote, urljoin, urlparse, urlunparse +from typing import Any, Dict, Optional, Set, Tuple +from urllib.parse import ParseResult, unquote, urljoin, urlparse, urlunparse try: from zoneinfo import ZoneInfo @@ -15,7 +16,7 @@ except ModuleNotFoundError: from pelican.plugins import signals -from pelican.settings import DEFAULT_CONFIG +from pelican.settings import DEFAULT_CONFIG, Settings from pelican.utils import ( deprecated_attribute, memoized, @@ -44,12 +45,20 @@ class Content: """ + default_template: Optional[str] = None + mandatory_properties: Tuple[str, ...] 
= () + @deprecated_attribute(old="filename", new="source_path", since=(3, 2, 0)) def filename(): return None def __init__( - self, content, metadata=None, settings=None, source_path=None, context=None + self, + content: str, + metadata: Optional[Dict[str, Any]] = None, + settings: Optional[Settings] = None, + source_path: Optional[str] = None, + context: Optional[Dict[Any, Any]] = None, ): if metadata is None: metadata = {} @@ -156,10 +165,10 @@ class Content: signals.content_object_init.send(self) - def __str__(self): + def __str__(self) -> str: return self.source_path or repr(self) - def _has_valid_mandatory_properties(self): + def _has_valid_mandatory_properties(self) -> bool: """Test mandatory properties are set.""" for prop in self.mandatory_properties: if not hasattr(self, prop): @@ -169,7 +178,7 @@ class Content: return False return True - def _has_valid_save_as(self): + def _has_valid_save_as(self) -> bool: """Return true if save_as doesn't write outside output path, false otherwise.""" try: @@ -190,7 +199,7 @@ class Content: return True - def _has_valid_status(self): + def _has_valid_status(self) -> bool: if hasattr(self, "allowed_statuses"): if self.status not in self.allowed_statuses: logger.error( @@ -204,7 +213,7 @@ class Content: # if undefined we allow all return True - def is_valid(self): + def is_valid(self) -> bool: """Validate Content""" # Use all() to not short circuit and get results of all validations return all( @@ -216,7 +225,7 @@ class Content: ) @property - def url_format(self): + def url_format(self) -> Dict[str, Any]: """Returns the URL, formatted with the proper values""" metadata = copy.copy(self.metadata) path = self.metadata.get("path", self.get_relative_source_path()) @@ -232,19 +241,19 @@ class Content: ) return metadata - def _expand_settings(self, key, klass=None): + def _expand_settings(self, key: str, klass: Optional[str] = None) -> str: if not klass: klass = self.__class__.__name__ fq_key = (f"{klass}_{key}").upper() return 
str(self.settings[fq_key]).format(**self.url_format) - def get_url_setting(self, key): + def get_url_setting(self, key: str) -> str: if hasattr(self, "override_" + key): return getattr(self, "override_" + key) key = key if self.in_default_lang else "lang_%s" % key return self._expand_settings(key) - def _link_replacer(self, siteurl, m): + def _link_replacer(self, siteurl: str, m: re.Match) -> str: what = m.group("what") value = urlparse(m.group("value")) path = value.path @@ -272,15 +281,15 @@ class Content: # XXX Put this in a different location. if what in {"filename", "static", "attach"}: - def _get_linked_content(key, url): + def _get_linked_content(key: str, url: ParseResult) -> Optional[Content]: nonlocal value - def _find_path(path): + def _find_path(path: str) -> Optional[Content]: if path.startswith("/"): path = path[1:] else: # relative to the source path of this content - path = self.get_relative_source_path( + path = self.get_relative_source_path( # type: ignore os.path.join(self.relative_dir, path) ) return self._context[key].get(path, None) @@ -324,7 +333,7 @@ class Content: linked_content = _get_linked_content(key, value) if linked_content: if what == "attach": - linked_content.attach_to(self) + linked_content.attach_to(self) # type: ignore origin = joiner(siteurl, linked_content.url) origin = origin.replace("\\", "/") # for Windows paths. else: @@ -359,7 +368,7 @@ class Content: return "".join((m.group("markup"), m.group("quote"), origin, m.group("quote"))) - def _get_intrasite_link_regex(self): + def _get_intrasite_link_regex(self) -> re.Pattern: intrasite_link_regex = self.settings["INTRASITE_LINK_REGEX"] regex = r""" (?P<[^\>]+ # match tag with all url-value attributes @@ -370,7 +379,7 @@ class Content: (?P=quote)""".format(intrasite_link_regex) return re.compile(regex, re.X) - def _update_content(self, content, siteurl): + def _update_content(self, content: str, siteurl: str) -> str: """Update the content attribute. 
Change all the relative paths of the content to relative paths @@ -386,7 +395,7 @@ class Content: hrefs = self._get_intrasite_link_regex() return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content) - def get_static_links(self): + def get_static_links(self) -> Set[str]: static_links = set() hrefs = self._get_intrasite_link_regex() for m in hrefs.finditer(self._content): @@ -402,15 +411,15 @@ class Content: path = self.get_relative_source_path( os.path.join(self.relative_dir, path) ) - path = path.replace("%20", " ") + path = path.replace("%20", " ") # type: ignore static_links.add(path) return static_links - def get_siteurl(self): + def get_siteurl(self) -> str: return self._context.get("localsiteurl", "") @memoized - def get_content(self, siteurl): + def get_content(self, siteurl: str) -> str: if hasattr(self, "_get_content"): content = self._get_content() else: @@ -418,11 +427,11 @@ class Content: return self._update_content(content, siteurl) @property - def content(self): + def content(self) -> str: return self.get_content(self.get_siteurl()) @memoized - def get_summary(self, siteurl): + def get_summary(self, siteurl: str) -> str: """Returns the summary of an article. 
This is based on the summary metadata if set, otherwise truncate the @@ -441,10 +450,10 @@ class Content: ) @property - def summary(self): + def summary(self) -> str: return self.get_summary(self.get_siteurl()) - def _get_summary(self): + def _get_summary(self) -> str: """deprecated function to access summary""" logger.warning( @@ -454,34 +463,36 @@ class Content: return self.summary @summary.setter - def summary(self, value): + def summary(self, value: str): """Dummy function""" pass @property - def status(self): + def status(self) -> str: return self._status @status.setter - def status(self, value): + def status(self, value: str) -> None: # TODO maybe typecheck self._status = value.lower() @property - def url(self): + def url(self) -> str: return self.get_url_setting("url") @property - def save_as(self): + def save_as(self) -> str: return self.get_url_setting("save_as") - def _get_template(self): + def _get_template(self) -> str: if hasattr(self, "template") and self.template is not None: return self.template else: return self.default_template - def get_relative_source_path(self, source_path=None): + def get_relative_source_path( + self, source_path: Optional[str] = None + ) -> Optional[str]: """Return the relative path (from the content path) to the given source_path. 
@@ -501,7 +512,7 @@ class Content: ) @property - def relative_dir(self): + def relative_dir(self) -> str: return posixize_path( os.path.dirname( os.path.relpath( @@ -511,7 +522,7 @@ class Content: ) ) - def refresh_metadata_intersite_links(self): + def refresh_metadata_intersite_links(self) -> None: for key in self.settings["FORMATTED_FIELDS"]: if key in self.metadata and key != "summary": value = self._update_content(self.metadata[key], self.get_siteurl()) @@ -534,7 +545,7 @@ class Page(Content): default_status = "published" default_template = "page" - def _expand_settings(self, key): + def _expand_settings(self, key: str) -> str: klass = "draft_page" if self.status == "draft" else None return super()._expand_settings(key, klass) @@ -561,7 +572,7 @@ class Article(Content): if not hasattr(self, "date") and self.status == "draft": self.date = datetime.datetime.max.replace(tzinfo=self.timezone) - def _expand_settings(self, key): + def _expand_settings(self, key: str) -> str: klass = "draft" if self.status == "draft" else "article" return super()._expand_settings(key, klass) @@ -571,7 +582,7 @@ class Static(Content): default_status = "published" default_template = None - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self._output_location_referenced = False @@ -588,18 +599,18 @@ class Static(Content): return None @property - def url(self): + def url(self) -> str: # Note when url has been referenced, so we can avoid overriding it. self._output_location_referenced = True return super().url @property - def save_as(self): + def save_as(self) -> str: # Note when save_as has been referenced, so we can avoid overriding it. 
self._output_location_referenced = True return super().save_as - def attach_to(self, content): + def attach_to(self, content: Content) -> None: """Override our output directory with that of the given content object.""" # Determine our file's new output path relative to the linking @@ -624,7 +635,7 @@ class Static(Content): new_url = path_to_url(new_save_as) - def _log_reason(reason): + def _log_reason(reason: str) -> None: logger.warning( "The {attach} link in %s cannot relocate " "%s because %s. Falling back to " From e4807316ae9338f05701a70d216687a94fb796d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ricks?= Date: Thu, 18 Jan 2024 09:18:00 +0100 Subject: [PATCH 14/22] Add type hints for utils module Types make it easier to understand the code and improve autocompletion in IDEs. --- pelican/utils.py | 143 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 97 insertions(+), 46 deletions(-) diff --git a/pelican/utils.py b/pelican/utils.py index eda53d3f..5f161667 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import datetime import fnmatch import locale @@ -16,6 +18,21 @@ from html import entities from html.parser import HTMLParser from itertools import groupby from operator import attrgetter +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Collection, + Dict, + Generator, + Iterable, + List, + Optional, + Sequence, + Tuple, + Type, + Union, +) import dateutil.parser @@ -27,11 +44,15 @@ from markupsafe import Markup import watchfiles +if TYPE_CHECKING: + from pelican.contents import Content + from pelican.readers import Readers + from pelican.settings import Settings logger = logging.getLogger(__name__) -def sanitised_join(base_directory, *parts): +def sanitised_join(base_directory: str, *parts: str) -> str: joined = posixize_path(os.path.abspath(os.path.join(base_directory, *parts))) base = posixize_path(os.path.abspath(base_directory)) if not joined.startswith(base): @@ 
-40,7 +61,7 @@ def sanitised_join(base_directory, *parts): return joined -def strftime(date, date_format): +def strftime(date: datetime.datetime, date_format: str) -> str: """ Enhanced replacement for built-in strftime with zero stripping @@ -109,10 +130,10 @@ class DateFormatter: defined in LOCALE setting """ - def __init__(self): + def __init__(self) -> None: self.locale = locale.setlocale(locale.LC_TIME) - def __call__(self, date, date_format): + def __call__(self, date: datetime.datetime, date_format: str) -> str: # on OSX, encoding from LC_CTYPE determines the unicode output in PY3 # make sure it's same as LC_TIME with temporary_locale(self.locale, locale.LC_TIME), temporary_locale( @@ -131,11 +152,11 @@ class memoized: """ - def __init__(self, func): + def __init__(self, func: Callable) -> None: self.func = func - self.cache = {} + self.cache: Dict[Any, Any] = {} - def __call__(self, *args): + def __call__(self, *args) -> Any: if not isinstance(args, Hashable): # uncacheable. a list, for instance. # better to not cache than blow up. @@ -147,17 +168,23 @@ class memoized: self.cache[args] = value return value - def __repr__(self): + def __repr__(self) -> Optional[str]: return self.func.__doc__ - def __get__(self, obj, objtype): + def __get__(self, obj: Any, objtype): """Support instance methods.""" fn = partial(self.__call__, obj) fn.cache = self.cache return fn -def deprecated_attribute(old, new, since=None, remove=None, doc=None): +def deprecated_attribute( + old: str, + new: str, + since: Tuple[int, ...], + remove: Optional[Tuple[int, ...]] = None, + doc: Optional[str] = None, +): """Attribute deprecation decorator for gentle upgrades For example: @@ -198,7 +225,7 @@ def deprecated_attribute(old, new, since=None, remove=None, doc=None): return decorator -def get_date(string): +def get_date(string: str) -> datetime.datetime: """Return a datetime object from a string. If no format matches the given date, raise a ValueError. 
@@ -212,7 +239,9 @@ def get_date(string): @contextmanager -def pelican_open(filename, mode="r", strip_crs=(sys.platform == "win32")): +def pelican_open( + filename: str, mode: str = "r", strip_crs: bool = (sys.platform == "win32") +) -> Generator[str, None, None]: """Open a file and return its content""" # utf-8-sig will clear any BOM if present @@ -221,7 +250,12 @@ def pelican_open(filename, mode="r", strip_crs=(sys.platform == "win32")): yield content -def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False): +def slugify( + value: str, + regex_subs: Iterable[Tuple[str, str]] = (), + preserve_case: bool = False, + use_unicode: bool = False, +) -> str: """ Normalizes string, converts to lowercase, removes non-alpha characters, and converts spaces to hyphens. @@ -233,9 +267,10 @@ def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False): """ import unicodedata + import unidecode - def normalize_unicode(text): + def normalize_unicode(text: str) -> str: # normalize text by compatibility composition # see: https://en.wikipedia.org/wiki/Unicode_equivalence return unicodedata.normalize("NFKC", text) @@ -262,7 +297,9 @@ def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False): return value.strip() -def copy(source, destination, ignores=None): +def copy( + source: str, destination: str, ignores: Optional[Iterable[str]] = None +) -> None: """Recursively copy source into destination. If source is a file, destination has to be a file as well. 
@@ -334,7 +371,7 @@ def copy(source, destination, ignores=None): ) -def copy_file(source, destination): +def copy_file(source: str, destination: str) -> None: """Copy a file""" try: shutil.copyfile(source, destination) @@ -344,7 +381,7 @@ def copy_file(source, destination): ) -def clean_output_dir(path, retention): +def clean_output_dir(path: str, retention: Iterable[str]) -> None: """Remove all files from output directory except those in retention list""" if not os.path.exists(path): @@ -381,24 +418,24 @@ def clean_output_dir(path, retention): logger.error("Unable to delete %s, file type unknown", file) -def get_relative_path(path): +def get_relative_path(path: str) -> str: """Return the relative path from the given path to the root path.""" components = split_all(path) - if len(components) <= 1: + if components is None or len(components) <= 1: return os.curdir else: parents = [os.pardir] * (len(components) - 1) return os.path.join(*parents) -def path_to_url(path): +def path_to_url(path: str) -> str: """Return the URL corresponding to a given path.""" if path is not None: path = posixize_path(path) return path -def posixize_path(rel_path): +def posixize_path(rel_path: str) -> str: """Use '/' as path separator, so that source references, like '{static}/foo/bar.jpg' or 'extras/favicon.ico', will work on Windows as well as on Mac and Linux.""" @@ -427,20 +464,20 @@ class _HTMLWordTruncator(HTMLParser): _singlets = ("br", "col", "link", "base", "img", "param", "area", "hr", "input") class TruncationCompleted(Exception): - def __init__(self, truncate_at): + def __init__(self, truncate_at: int) -> None: super().__init__(truncate_at) self.truncate_at = truncate_at - def __init__(self, max_words): + def __init__(self, max_words: int) -> None: super().__init__(convert_charrefs=False) self.max_words = max_words self.words_found = 0 self.open_tags = [] self.last_word_end = None - self.truncate_at = None + self.truncate_at: Optional[int] = None - def feed(self, *args, 
**kwargs): + def feed(self, *args, **kwargs) -> None: try: super().feed(*args, **kwargs) except self.TruncationCompleted as exc: @@ -448,29 +485,29 @@ class _HTMLWordTruncator(HTMLParser): else: self.truncate_at = None - def getoffset(self): + def getoffset(self) -> int: line_start = 0 lineno, line_offset = self.getpos() for i in range(lineno - 1): line_start = self.rawdata.index("\n", line_start) + 1 return line_start + line_offset - def add_word(self, word_end): + def add_word(self, word_end: int) -> None: self.words_found += 1 self.last_word_end = None if self.words_found == self.max_words: raise self.TruncationCompleted(word_end) - def add_last_word(self): + def add_last_word(self) -> None: if self.last_word_end is not None: self.add_word(self.last_word_end) - def handle_starttag(self, tag, attrs): + def handle_starttag(self, tag: str, attrs: Any) -> None: self.add_last_word() if tag not in self._singlets: self.open_tags.insert(0, tag) - def handle_endtag(self, tag): + def handle_endtag(self, tag: str) -> None: self.add_last_word() try: i = self.open_tags.index(tag) @@ -481,7 +518,7 @@ class _HTMLWordTruncator(HTMLParser): # all unclosed intervening start tags with omitted end tags del self.open_tags[: i + 1] - def handle_data(self, data): + def handle_data(self, data: str) -> None: word_end = 0 offset = self.getoffset() @@ -499,7 +536,7 @@ class _HTMLWordTruncator(HTMLParser): if word_end < len(data): self.add_last_word() - def _handle_ref(self, name, char): + def _handle_ref(self, name: str, char: str) -> None: """ Called by handle_entityref() or handle_charref() when a ref like `—`, `—`, or `—` is found. 
@@ -543,7 +580,7 @@ class _HTMLWordTruncator(HTMLParser): else: self.add_last_word() - def handle_entityref(self, name): + def handle_entityref(self, name: str) -> None: """ Called when an entity ref like '—' is found @@ -556,7 +593,7 @@ class _HTMLWordTruncator(HTMLParser): char = "" self._handle_ref(name, char) - def handle_charref(self, name): + def handle_charref(self, name: str) -> None: """ Called when a char ref like '—' or '—' is found @@ -574,7 +611,7 @@ class _HTMLWordTruncator(HTMLParser): self._handle_ref("#" + name, char) -def truncate_html_words(s, num, end_text="…"): +def truncate_html_words(s: str, num: int, end_text: str = "…") -> str: """Truncates HTML to a certain number of words. (not counting tags and comments). Closes opened tags if they were correctly @@ -600,7 +637,10 @@ def truncate_html_words(s, num, end_text="…"): return out -def process_translations(content_list, translation_id=None): +def process_translations( + content_list: List[Content], + translation_id: Optional[Union[str, Collection[str]]] = None, +) -> Tuple[List[Content], List[Content]]: """Finds translations and returns them. For each content_list item, populates the 'translations' attribute, and @@ -658,7 +698,7 @@ def process_translations(content_list, translation_id=None): return index, translations -def get_original_items(items, with_str): +def get_original_items(items: List[Content], with_str: str) -> List[Content]: def _warn_source_paths(msg, items, *extra): args = [len(items)] args.extend(extra) @@ -698,7 +738,10 @@ def get_original_items(items, with_str): return original_items -def order_content(content_list, order_by="slug"): +def order_content( + content_list: List[Content], + order_by: Union[str, Callable[[Content], Any], None] = "slug", +) -> List[Content]: """Sorts content. order_by can be a string of an attribute or sorting function. 
If order_by @@ -758,7 +801,11 @@ def order_content(content_list, order_by="slug"): return content_list -def wait_for_changes(settings_file, reader_class, settings): +def wait_for_changes( + settings_file: str, + reader_class: Type["Readers"], + settings: "Settings", +): content_path = settings.get("PATH", "") theme_path = settings.get("THEME", "") ignore_files = { @@ -788,13 +835,15 @@ def wait_for_changes(settings_file, reader_class, settings): return next( watchfiles.watch( *watching_paths, - watch_filter=watchfiles.DefaultFilter(ignore_entity_patterns=ignore_files), + watch_filter=watchfiles.DefaultFilter(ignore_entity_patterns=ignore_files), # type: ignore rust_timeout=0, ) ) -def set_date_tzinfo(d, tz_name=None): +def set_date_tzinfo( + d: datetime.datetime, tz_name: Optional[str] = None +) -> datetime.datetime: """Set the timezone for dates that don't have tzinfo""" if tz_name and not d.tzinfo: timezone = ZoneInfo(tz_name) @@ -805,11 +854,11 @@ def set_date_tzinfo(d, tz_name=None): return d -def mkdir_p(path): +def mkdir_p(path: str) -> None: os.makedirs(path, exist_ok=True) -def split_all(path): +def split_all(path: Union[str, pathlib.Path, None]) -> Optional[Sequence[str]]: """Split a path into a list of components While os.path.split() splits a single component off the back of @@ -840,12 +889,12 @@ def split_all(path): ) -def path_to_file_url(path): +def path_to_file_url(path: str) -> str: """Convert file-system path to file:// URL""" return urllib.parse.urljoin("file://", urllib.request.pathname2url(path)) -def maybe_pluralize(count, singular, plural): +def maybe_pluralize(count: int, singular: str, plural: str) -> str: """ Returns a formatted string containing count and plural if count is not 1 Returns count and singular if count is 1 @@ -862,7 +911,9 @@ def maybe_pluralize(count, singular, plural): @contextmanager -def temporary_locale(temp_locale=None, lc_category=locale.LC_ALL): +def temporary_locale( + temp_locale: Optional[str] = None, lc_category: 
int = locale.LC_ALL +) -> Generator[None, None, None]: """ Enable code to run in a context with a temporary locale Resets the locale back when exiting context. From c36ab075269771834b5e05e4d1586d050743d457 Mon Sep 17 00:00:00 2001 From: MinchinWeb Date: Fri, 26 Jan 2024 16:31:22 -0700 Subject: [PATCH 15/22] write back to `._summary` --- pelican/contents.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pelican/contents.py b/pelican/contents.py index 27b8bbc3..e0629e2a 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -519,7 +519,7 @@ class Content: setattr(self, key.lower(), value) # _summary is an internal variable that some plugins may be writing to, - # so ensure changes to it are picked up + # so ensure changes to it are picked up, and write summary back to it if "summary" in self.settings["FORMATTED_FIELDS"]: if hasattr(self, "_summary"): self.metadata["summary"] = self._summary @@ -528,8 +528,6 @@ class Content: self.metadata["summary"] = self._update_content( self.metadata["summary"], self.get_siteurl() ) - - if hasattr(self, "_summary") and "summary" in self.metadata: self._summary = self.metadata["summary"] From f1f2ceccc757d9743dde39f626eccf05e3e9a5b0 Mon Sep 17 00:00:00 2001 From: MinchinWeb Date: Sat, 27 Jan 2024 10:47:54 -0700 Subject: [PATCH 16/22] Warning/error logging: be explicit in how the `stacklevel` variable is handled --- pelican/log.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/pelican/log.py b/pelican/log.py index 6a8fcdf1..ef49d280 100644 --- a/pelican/log.py +++ b/pelican/log.py @@ -85,19 +85,39 @@ class FatalLogger(LimitLogger): warnings_fatal = False errors_fatal = False - # adding `stacklevel=2` means that the displayed filename and line number - # will match the "original" calling location, rather than the wrapper here - def warning(self, *args, **kwargs): - if "stacklevel" not in kwargs.keys(): - kwargs["stacklevel"] = 2 - 
super().warning(*args, **kwargs) + def warning(self, *args, stacklevel=1, **kwargs): + """ + Displays a logging warning. + + Wrapping it here allows Pelican to filter warnings, and conditionally + make warnings fatal. + + Args: + stacklevel (int): the stacklevel that would be used to display the + calling location, except for this function. Adjusting the + stacklevel allows you to see the "true" calling location of the + warning, rather than this wrapper location. + """ + stacklevel += 1 + super().warning(*args, stacklevel=stacklevel, **kwargs) if FatalLogger.warnings_fatal: raise RuntimeError("Warning encountered") - def error(self, *args, **kwargs): - if "stacklevel" not in kwargs.keys(): - kwargs["stacklevel"] = 2 - super().error(*args, **kwargs) + def error(self, *args, stacklevel=1, **kwargs): + """ + Displays a logging error. + + Wrapping it here allows Pelican to filter errors, and conditionally + make errors fatal. + + Args: + stacklevel (int): the stacklevel that would be used to display the + calling location, except for this function. Adjusting the + stacklevel allows you to see the "true" calling location of the + error, rather than this wrapper location. 
+ """ + stacklevel += 1 + super().error(*args, stacklevel=stacklevel, **kwargs) if FatalLogger.errors_fatal: raise RuntimeError("Error encountered") From 1f14606f8339385c5176ba05adca4664a3ad8868 Mon Sep 17 00:00:00 2001 From: MinchinWeb Date: Sat, 27 Jan 2024 10:51:35 -0700 Subject: [PATCH 17/22] On failing to load a plugin, show the stacktrace if pelican is run in debug mode --- pelican/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pelican/__init__.py b/pelican/__init__.py index 40251887..68f3e553 100644 --- a/pelican/__init__.py +++ b/pelican/__init__.py @@ -80,8 +80,14 @@ class Pelican: plugin.register() self.plugins.append(plugin) except Exception as e: - logger.error("Cannot register plugin `%s`\n%s", name, e, stacklevel=3) - print(e.stacktrace) + logger.error( + "Cannot register plugin `%s`\n%s", + name, + e, + stacklevel=2, + ) + if self.settings.get("DEBUG", False): + console.print_exception() self.settings["PLUGINS"] = [get_plugin_name(p) for p in self.plugins] From 7c7c9355b6c27122dbff6446cd366017f81eb0f2 Mon Sep 17 00:00:00 2001 From: Justin Mayer Date: Tue, 12 Mar 2024 11:57:46 +0100 Subject: [PATCH 18/22] Pin Ruff to major semantic version 0.1.x Upgrading to 0.3.0+ requires code style changes to the code base. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c8bbe985..eb1884a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ dev = [ "pytest-xdist>=3.4.0", "tox>=4.11.3", "invoke>=2.2.0", - "ruff>=0.1.5", + "ruff>=0.1.5,<0.2.0", "tomli>=2.0.1; python_version < \"3.11\"", ] From 74541381848f1d65ec64463469b5980ba0646617 Mon Sep 17 00:00:00 2001 From: Justin Mayer Date: Tue, 12 Mar 2024 12:05:09 +0100 Subject: [PATCH 19/22] Update `setup-python` & `setup-pdm` GitHub Actions --- .github/workflows/main.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cd646522..8cd63cc7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} cache: "pip" @@ -53,7 +53,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: pdm-project/setup-pdm@v3 + - uses: pdm-project/setup-pdm@v4 with: python-version: "3.11" cache: true @@ -71,7 +71,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: pdm-project/setup-pdm@v3 + - uses: pdm-project/setup-pdm@v4 with: python-version: "3.11" cache: true @@ -90,7 +90,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" cache: "pip" @@ -122,7 +122,7 @@ jobs: token: ${{ secrets.GH_TOKEN }} - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" From fabc40927750f52f11f27695e89ff76c0863a79f Mon Sep 17 00:00:00 2001 From: Justin Mayer Date: Tue, 12 Mar 2024 12:18:11 +0100 Subject: [PATCH 20/22] Update more GitHub Actions to resolve warnings --- 
.github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8cd63cc7..4c0127df 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -64,7 +64,7 @@ jobs: - name: Run linters run: pdm lint --diff - name: Run pre-commit checks on all files - uses: pre-commit/action@v3.0.0 + uses: pre-commit/action@v3.0.1 build: name: Test build @@ -100,7 +100,7 @@ jobs: - name: Check run: tox -e docs - name: cache the docs for inspection - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: docs path: docs/_build/html/ From b87308cfaaa269c44784cda69855ecaf298f9f5e Mon Sep 17 00:00:00 2001 From: Justin Mayer Date: Wed, 27 Mar 2024 08:25:48 +0100 Subject: [PATCH 21/22] Update Ruff dependency version --- .pre-commit-config.yaml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 333bc3c0..d6cfac07 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ repos: - id: forbid-new-submodules - id: trailing-whitespace - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.5 + rev: v0.1.15 hooks: - id: ruff - id: ruff-format diff --git a/pyproject.toml b/pyproject.toml index eb1884a9..2f7d677c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ dev = [ "pytest-xdist>=3.4.0", "tox>=4.11.3", "invoke>=2.2.0", - "ruff>=0.1.5,<0.2.0", + "ruff>=0.1.15,<0.2.0", "tomli>=2.0.1; python_version < \"3.11\"", ] From 94bcd41f27d7f38a9dbd0847c6166e91a66d2090 Mon Sep 17 00:00:00 2001 From: Justin Mayer Date: Wed, 27 Mar 2024 08:26:55 +0100 Subject: [PATCH 22/22] Ignore Sphinx 7.2.x package install warnings Sphinx 7.2+ requires Python 3.9+, which results in annoying warnings since we still support Python 3.8.x. 
--- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2f7d677c..3ca06df4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ changelog-header = "###############" version-header = "=" [tool.pdm] +ignore_package_warnings = ["sphinx"] [tool.pdm.scripts] docbuild = "invoke docbuild"