diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cd646522..4c0127df 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} cache: "pip" @@ -53,7 +53,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: pdm-project/setup-pdm@v3 + - uses: pdm-project/setup-pdm@v4 with: python-version: "3.11" cache: true @@ -64,14 +64,14 @@ jobs: - name: Run linters run: pdm lint --diff - name: Run pre-commit checks on all files - uses: pre-commit/action@v3.0.0 + uses: pre-commit/action@v3.0.1 build: name: Test build runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: pdm-project/setup-pdm@v3 + - uses: pdm-project/setup-pdm@v4 with: python-version: "3.11" cache: true @@ -90,7 +90,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" cache: "pip" @@ -100,7 +100,7 @@ jobs: - name: Check run: tox -e docs - name: cache the docs for inspection - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: docs path: docs/_build/html/ @@ -122,7 +122,7 @@ jobs: token: ${{ secrets.GH_TOKEN }} - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 46caab56..5bda568b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ repos: - id: forbid-new-submodules - id: trailing-whitespace - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.5 + rev: v0.1.15 hooks: - id: ruff args: ["--fix"] diff --git a/docs/content.rst b/docs/content.rst index cacacea9..4277b838 100644 --- a/docs/content.rst +++ b/docs/content.rst @@ 
-439,8 +439,8 @@ For **Markdown**, one must rely on an extension. For example, using the `mdx_inc Importing an existing site ========================== -It is possible to import your site from WordPress, Tumblr, Dotclear, and RSS -feeds using a simple script. See :ref:`import`. +It is possible to import your site from several other blogging sites +(such as WordPress and Tumblr) using a simple script. See :ref:`import`. Translations ============ @@ -631,7 +631,7 @@ are not included by default in tag, category, and author indexes, nor in the main article feed. This has the effect of creating an "unlisted" post. .. _W3C ISO 8601: https://www.w3.org/TR/NOTE-datetime -.. _AsciiDoc: https://www.methods.co.nz/asciidoc/ +.. _AsciiDoc: https://asciidoc.org .. _Pelican Plugins: https://github.com/pelican-plugins .. _pelican-plugins: https://github.com/getpelican/pelican-plugins .. _Python-Markdown: https://github.com/Python-Markdown/markdown diff --git a/docs/importer.rst b/docs/importer.rst index 997a4632..49d6db24 100644 --- a/docs/importer.rst +++ b/docs/importer.rst @@ -11,6 +11,7 @@ software to reStructuredText or Markdown. The supported import formats are: - Blogger XML export - Dotclear export +- Medium export - Tumblr API - WordPress XML export - RSS/Atom feed @@ -26,6 +27,12 @@ not be converted (as Pelican also supports Markdown). manually, or use a plugin such as `More Categories`_ that enables multiple categories per article. +.. note:: + + Imported pages may contain links to images that still point to the original site. + So you might want to download those images into your local content and manually + re-link them from the relevant pages of your site.
+ Dependencies ============ @@ -65,6 +72,7 @@ Optional arguments -h, --help Show this help message and exit --blogger Blogger XML export (default: False) --dotclear Dotclear export (default: False) + --medium Medium export (default: False) --tumblr Tumblr API (default: False) --wpfile WordPress XML export (default: False) --feed Feed to parse (default: False) @@ -80,8 +88,7 @@ Optional arguments (default: False) --filter-author Import only post from the specified author --strip-raw Strip raw HTML code that can't be converted to markup - such as flash embeds or iframes (wordpress import - only) (default: False) + such as flash embeds or iframes (default: False) --wp-custpost Put wordpress custom post types in directories. If used with --dir-cat option directories will be created as "/post_type/category/" (wordpress import only) @@ -113,6 +120,14 @@ For Dotclear:: $ pelican-import --dotclear -o ~/output ~/backup.txt +For Medium:: + + $ pelican-import --medium -o ~/output ~/medium-export/posts/ + +The Medium export is a zip file. Unzip it, and point this tool to the +"posts" subdirectory. For more information on how to export, see +https://help.medium.com/hc/en-us/articles/115004745787-Export-your-account-data. + For Tumblr:: $ pelican-import --tumblr -o ~/output --blogname= @@ -121,6 +136,15 @@ For WordPress:: $ pelican-import --wpfile -o ~/output ~/posts.xml +For Medium (an example of using an RSS feed):: + + $ python -m pip install feedparser + $ pelican-import --feed https://medium.com/feed/@username + +.. note:: + + The RSS feed may only return the most recent posts — not all of them.
+ Tests ===== diff --git a/pelican/__init__.py b/pelican/__init__.py index 92dc90a8..aef4b124 100644 --- a/pelican/__init__.py +++ b/pelican/__init__.py @@ -80,7 +80,14 @@ class Pelican: plugin.register() self.plugins.append(plugin) except Exception as e: - logger.error("Cannot register plugin `%s`\n%s", name, e) + logger.error( + "Cannot register plugin `%s`\n%s", + name, + e, + stacklevel=2, + ) + if self.settings.get("DEBUG", False): + console.print_exception() self.settings["PLUGINS"] = [get_plugin_name(p) for p in self.plugins] @@ -120,12 +127,15 @@ class Pelican: if hasattr(p, "generate_context"): p.generate_context() + # for plugins that create/edit the summary + logger.debug("Signal all_generators_finalized.send()") + signals.all_generators_finalized.send(generators) + + # update links in the summary, etc for p in generators: if hasattr(p, "refresh_metadata_intersite_links"): p.refresh_metadata_intersite_links() - signals.all_generators_finalized.send(generators) - writer = self._get_writer() for p in generators: diff --git a/pelican/contents.py b/pelican/contents.py index 5e9ba089..9532c523 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -6,7 +6,8 @@ import os import re from datetime import timezone from html import unescape -from urllib.parse import unquote, urljoin, urlparse, urlunparse +from typing import Any, Dict, Optional, Set, Tuple +from urllib.parse import ParseResult, unquote, urljoin, urlparse, urlunparse try: from zoneinfo import ZoneInfo @@ -15,7 +16,7 @@ except ModuleNotFoundError: from pelican.plugins import signals -from pelican.settings import DEFAULT_CONFIG +from pelican.settings import DEFAULT_CONFIG, Settings # Import these so that they're available when you import from pelican.contents. from pelican.urlwrappers import Author, Category, Tag, URLWrapper # NOQA @@ -44,12 +45,20 @@ class Content: """ + default_template: Optional[str] = None + mandatory_properties: Tuple[str, ...] 
= () + @deprecated_attribute(old="filename", new="source_path", since=(3, 2, 0)) def filename(): return None def __init__( - self, content, metadata=None, settings=None, source_path=None, context=None + self, + content: str, + metadata: Optional[Dict[str, Any]] = None, + settings: Optional[Settings] = None, + source_path: Optional[str] = None, + context: Optional[Dict[Any, Any]] = None, ): if metadata is None: metadata = {} @@ -156,10 +165,10 @@ class Content: signals.content_object_init.send(self) - def __str__(self): + def __str__(self) -> str: return self.source_path or repr(self) - def _has_valid_mandatory_properties(self): + def _has_valid_mandatory_properties(self) -> bool: """Test mandatory properties are set.""" for prop in self.mandatory_properties: if not hasattr(self, prop): @@ -169,7 +178,7 @@ class Content: return False return True - def _has_valid_save_as(self): + def _has_valid_save_as(self) -> bool: """Return true if save_as doesn't write outside output path, false otherwise.""" try: @@ -190,7 +199,7 @@ class Content: return True - def _has_valid_status(self): + def _has_valid_status(self) -> bool: if hasattr(self, "allowed_statuses"): if self.status not in self.allowed_statuses: logger.error( @@ -204,7 +213,7 @@ class Content: # if undefined we allow all return True - def is_valid(self): + def is_valid(self) -> bool: """Validate Content""" # Use all() to not short circuit and get results of all validations return all( @@ -216,7 +225,7 @@ class Content: ) @property - def url_format(self): + def url_format(self) -> Dict[str, Any]: """Returns the URL, formatted with the proper values""" metadata = copy.copy(self.metadata) path = self.metadata.get("path", self.get_relative_source_path()) @@ -232,19 +241,19 @@ class Content: ) return metadata - def _expand_settings(self, key, klass=None): + def _expand_settings(self, key: str, klass: Optional[str] = None) -> str: if not klass: klass = self.__class__.__name__ fq_key = (f"{klass}_{key}").upper() return 
str(self.settings[fq_key]).format(**self.url_format) - def get_url_setting(self, key): + def get_url_setting(self, key: str) -> str: if hasattr(self, "override_" + key): return getattr(self, "override_" + key) key = key if self.in_default_lang else "lang_%s" % key return self._expand_settings(key) - def _link_replacer(self, siteurl, m): + def _link_replacer(self, siteurl: str, m: re.Match) -> str: what = m.group("what") value = urlparse(m.group("value")) path = value.path @@ -272,15 +281,15 @@ class Content: # XXX Put this in a different location. if what in {"filename", "static", "attach"}: - def _get_linked_content(key, url): + def _get_linked_content(key: str, url: ParseResult) -> Optional[Content]: nonlocal value - def _find_path(path): + def _find_path(path: str) -> Optional[Content]: if path.startswith("/"): path = path[1:] else: # relative to the source path of this content - path = self.get_relative_source_path( + path = self.get_relative_source_path( # type: ignore os.path.join(self.relative_dir, path) ) return self._context[key].get(path, None) @@ -324,7 +333,7 @@ class Content: linked_content = _get_linked_content(key, value) if linked_content: if what == "attach": - linked_content.attach_to(self) + linked_content.attach_to(self) # type: ignore origin = joiner(siteurl, linked_content.url) origin = origin.replace("\\", "/") # for Windows paths. else: @@ -359,7 +368,7 @@ class Content: return "".join((m.group("markup"), m.group("quote"), origin, m.group("quote"))) - def _get_intrasite_link_regex(self): + def _get_intrasite_link_regex(self) -> re.Pattern: intrasite_link_regex = self.settings["INTRASITE_LINK_REGEX"] regex = rf""" (?P<[^\>]+ # match tag with all url-value attributes @@ -370,7 +379,7 @@ class Content: (?P=quote)""" return re.compile(regex, re.X) - def _update_content(self, content, siteurl): + def _update_content(self, content: str, siteurl: str) -> str: """Update the content attribute. 
Change all the relative paths of the content to relative paths @@ -386,7 +395,7 @@ class Content: hrefs = self._get_intrasite_link_regex() return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content) - def get_static_links(self): + def get_static_links(self) -> Set[str]: static_links = set() hrefs = self._get_intrasite_link_regex() for m in hrefs.finditer(self._content): @@ -402,15 +411,15 @@ class Content: path = self.get_relative_source_path( os.path.join(self.relative_dir, path) ) - path = path.replace("%20", " ") + path = path.replace("%20", " ") # type: ignore static_links.add(path) return static_links - def get_siteurl(self): + def get_siteurl(self) -> str: return self._context.get("localsiteurl", "") @memoized - def get_content(self, siteurl): + def get_content(self, siteurl: str) -> str: if hasattr(self, "_get_content"): content = self._get_content() else: @@ -418,11 +427,11 @@ class Content: return self._update_content(content, siteurl) @property - def content(self): + def content(self) -> str: return self.get_content(self.get_siteurl()) @memoized - def get_summary(self, siteurl): + def get_summary(self, siteurl: str) -> str: """Returns the summary of an article. 
This is based on the summary metadata if set, otherwise truncate the @@ -441,10 +450,10 @@ class Content: ) @property - def summary(self): + def summary(self) -> str: return self.get_summary(self.get_siteurl()) - def _get_summary(self): + def _get_summary(self) -> str: """deprecated function to access summary""" logger.warning( @@ -454,33 +463,35 @@ class Content: return self.summary @summary.setter - def summary(self, value): + def summary(self, value: str): """Dummy function""" @property - def status(self): + def status(self) -> str: return self._status @status.setter - def status(self, value): + def status(self, value: str) -> None: # TODO maybe typecheck self._status = value.lower() @property - def url(self): + def url(self) -> str: return self.get_url_setting("url") @property - def save_as(self): + def save_as(self) -> str: return self.get_url_setting("save_as") - def _get_template(self): + def _get_template(self) -> str: if hasattr(self, "template") and self.template is not None: return self.template else: return self.default_template - def get_relative_source_path(self, source_path=None): + def get_relative_source_path( + self, source_path: Optional[str] = None + ) -> Optional[str]: """Return the relative path (from the content path) to the given source_path. 
@@ -500,7 +511,7 @@ class Content: ) @property - def relative_dir(self): + def relative_dir(self) -> str: return posixize_path( os.path.dirname( os.path.relpath( @@ -510,7 +521,7 @@ class Content: ) ) - def refresh_metadata_intersite_links(self): + def refresh_metadata_intersite_links(self) -> None: for key in self.settings["FORMATTED_FIELDS"]: if key in self.metadata and key != "summary": value = self._update_content(self.metadata[key], self.get_siteurl()) @@ -518,13 +529,16 @@ class Content: setattr(self, key.lower(), value) # _summary is an internal variable that some plugins may be writing to, - # so ensure changes to it are picked up - if ( - "summary" in self.settings["FORMATTED_FIELDS"] - and "summary" in self.metadata - ): - self._summary = self._update_content(self._summary, self.get_siteurl()) - self.metadata["summary"] = self._summary + # so ensure changes to it are picked up, and write summary back to it + if "summary" in self.settings["FORMATTED_FIELDS"]: + if hasattr(self, "_summary"): + self.metadata["summary"] = self._summary + + if "summary" in self.metadata: + self.metadata["summary"] = self._update_content( + self.metadata["summary"], self.get_siteurl() + ) + self._summary = self.metadata["summary"] class Page(Content): @@ -533,7 +547,7 @@ class Page(Content): default_status = "published" default_template = "page" - def _expand_settings(self, key): + def _expand_settings(self, key: str) -> str: klass = "draft_page" if self.status == "draft" else None return super()._expand_settings(key, klass) @@ -560,7 +574,7 @@ class Article(Content): if not hasattr(self, "date") and self.status == "draft": self.date = datetime.datetime.max.replace(tzinfo=self.timezone) - def _expand_settings(self, key): + def _expand_settings(self, key: str) -> str: klass = "draft" if self.status == "draft" else "article" return super()._expand_settings(key, klass) @@ -570,7 +584,7 @@ class Static(Content): default_status = "published" default_template = None - def 
__init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self._output_location_referenced = False @@ -587,18 +601,18 @@ class Static(Content): return None @property - def url(self): + def url(self) -> str: # Note when url has been referenced, so we can avoid overriding it. self._output_location_referenced = True return super().url @property - def save_as(self): + def save_as(self) -> str: # Note when save_as has been referenced, so we can avoid overriding it. self._output_location_referenced = True return super().save_as - def attach_to(self, content): + def attach_to(self, content: Content) -> None: """Override our output directory with that of the given content object.""" # Determine our file's new output path relative to the linking @@ -623,7 +637,7 @@ class Static(Content): new_url = path_to_url(new_save_as) - def _log_reason(reason): + def _log_reason(reason: str) -> None: logger.warning( "The {attach} link in %s cannot relocate " "%s because %s. 
Falling back to " diff --git a/pelican/generators.py b/pelican/generators.py index 3b5ca9e4..076c8d38 100644 --- a/pelican/generators.py +++ b/pelican/generators.py @@ -384,8 +384,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["CATEGORY_FEED_ATOM"]).format(slug=cat.slug), self.settings.get( "CATEGORY_FEED_ATOM_URL", - str(self.settings["CATEGORY_FEED_ATOM"]).format(slug=cat.slug), - ), + str(self.settings["CATEGORY_FEED_ATOM"]), + ).format(slug=cat.slug), feed_title=cat.name, ) @@ -396,8 +396,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["CATEGORY_FEED_RSS"]).format(slug=cat.slug), self.settings.get( "CATEGORY_FEED_RSS_URL", - str(self.settings["CATEGORY_FEED_RSS"]).format(slug=cat.slug), - ), + str(self.settings["CATEGORY_FEED_RSS"]), + ).format(slug=cat.slug), feed_title=cat.name, feed_type="rss", ) @@ -410,8 +410,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["AUTHOR_FEED_ATOM"]).format(slug=auth.slug), self.settings.get( "AUTHOR_FEED_ATOM_URL", - str(self.settings["AUTHOR_FEED_ATOM"]).format(slug=auth.slug), - ), + str(self.settings["AUTHOR_FEED_ATOM"]), + ).format(slug=auth.slug), feed_title=auth.name, ) @@ -422,8 +422,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["AUTHOR_FEED_RSS"]).format(slug=auth.slug), self.settings.get( "AUTHOR_FEED_RSS_URL", - str(self.settings["AUTHOR_FEED_RSS"]).format(slug=auth.slug), - ), + str(self.settings["AUTHOR_FEED_RSS"]), + ).format(slug=auth.slug), feed_title=auth.name, feed_type="rss", ) @@ -437,8 +437,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["TAG_FEED_ATOM"]).format(slug=tag.slug), self.settings.get( "TAG_FEED_ATOM_URL", - str(self.settings["TAG_FEED_ATOM"]).format(slug=tag.slug), - ), + str(self.settings["TAG_FEED_ATOM"]), + ).format(slug=tag.slug), feed_title=tag.name, ) @@ -449,8 +449,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["TAG_FEED_RSS"]).format(slug=tag.slug), self.settings.get( 
"TAG_FEED_RSS_URL", - str(self.settings["TAG_FEED_RSS"]).format(slug=tag.slug), - ), + str(self.settings["TAG_FEED_RSS"]), + ).format(slug=tag.slug), feed_title=tag.name, feed_type="rss", ) @@ -471,10 +471,8 @@ class ArticlesGenerator(CachingGenerator): str(self.settings["TRANSLATION_FEED_ATOM"]).format(lang=lang), self.settings.get( "TRANSLATION_FEED_ATOM_URL", - str(self.settings["TRANSLATION_FEED_ATOM"]).format( - lang=lang - ), - ), + str(self.settings["TRANSLATION_FEED_ATOM"]), + ).format(lang=lang), ) if self.settings.get("TRANSLATION_FEED_RSS"): writer.write_feed( diff --git a/pelican/log.py b/pelican/log.py index 7eb4556d..a1193cfa 100644 --- a/pelican/log.py +++ b/pelican/log.py @@ -85,13 +85,39 @@ class FatalLogger(LimitLogger): warnings_fatal = False errors_fatal = False - def warning(self, *args, **kwargs): - super().warning(*args, **kwargs) + def warning(self, *args, stacklevel=1, **kwargs): + """ + Displays a logging warning. + + Wrapping it here allows Pelican to filter warnings, and conditionally + make warnings fatal. + + Args: + stacklevel (int): the stacklevel that would be used to display the + calling location, except for this function. Adjusting the + stacklevel allows you to see the "true" calling location of the + warning, rather than this wrapper location. + """ + stacklevel += 1 + super().warning(*args, stacklevel=stacklevel, **kwargs) if FatalLogger.warnings_fatal: raise RuntimeError("Warning encountered") - def error(self, *args, **kwargs): - super().error(*args, **kwargs) + def error(self, *args, stacklevel=1, **kwargs): + """ + Displays a logging error. + + Wrapping it here allows Pelican to filter errors, and conditionally + make errors non-fatal. + + Args: + stacklevel (int): the stacklevel that would be used to display the + calling location, except for this function. Adjusting the + stacklevel allows you to see the "true" calling location of the + error, rather than this wrapper location. 
+ """ + stacklevel += 1 + super().error(*args, stacklevel=stacklevel, **kwargs) if FatalLogger.errors_fatal: raise RuntimeError("Error encountered") diff --git a/pelican/settings.py b/pelican/settings.py index 9d2fef25..1d914ae8 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -8,11 +8,13 @@ import re import sys from os.path import isabs from pathlib import Path +from types import ModuleType +from typing import Any, Dict, Optional from pelican.log import LimitFilter -def load_source(name, path): +def load_source(name: str, path: str) -> ModuleType: spec = importlib.util.spec_from_file_location(name, path) mod = importlib.util.module_from_spec(spec) sys.modules[name] = mod @@ -22,6 +24,8 @@ def load_source(name, path): logger = logging.getLogger(__name__) +Settings = Dict[str, Any] + DEFAULT_THEME = os.path.join( os.path.dirname(os.path.abspath(__file__)), "themes", "notmyidea" ) @@ -177,7 +181,9 @@ DEFAULT_CONFIG = { PYGMENTS_RST_OPTIONS = None -def read_settings(path=None, override=None): +def read_settings( + path: Optional[str] = None, override: Optional[Settings] = None +) -> Settings: settings = override or {} if path: @@ -221,7 +227,7 @@ def read_settings(path=None, override=None): return settings -def get_settings_from_module(module=None): +def get_settings_from_module(module: Optional[ModuleType] = None) -> Settings: """Loads settings from a module, returns a dictionary.""" context = {} @@ -230,7 +236,7 @@ def get_settings_from_module(module=None): return context -def get_settings_from_file(path): +def get_settings_from_file(path: str) -> Settings: """Loads settings from a file path, returning a dict.""" name, ext = os.path.splitext(os.path.basename(path)) @@ -238,7 +244,7 @@ def get_settings_from_file(path): return get_settings_from_module(module) -def get_jinja_environment(settings): +def get_jinja_environment(settings: Settings) -> Settings: """Sets the environment for Jinja""" jinja_env = settings.setdefault( @@ -253,7 +259,7 @@ def 
get_jinja_environment(settings): return settings -def _printf_s_to_format_field(printf_string, format_field): +def _printf_s_to_format_field(printf_string: str, format_field: str) -> str: """Tries to replace %s with {format_field} in the provided printf_string. Raises ValueError in case of failure. """ @@ -267,7 +273,7 @@ def _printf_s_to_format_field(printf_string, format_field): return result -def handle_deprecated_settings(settings): +def handle_deprecated_settings(settings: Settings) -> Settings: """Converts deprecated settings and issues warnings. Issues an exception if both old and new setting is specified. """ @@ -564,7 +570,7 @@ def handle_deprecated_settings(settings): return settings -def configure_settings(settings): +def configure_settings(settings: Settings) -> Settings: """Provide optimizations, error checking, and warnings for the given settings. Also, specify the log messages to be ignored. diff --git a/pelican/tests/content/medium_post_content.txt b/pelican/tests/content/medium_post_content.txt new file mode 100644 index 00000000..5e21881c --- /dev/null +++ b/pelican/tests/content/medium_post_content.txt @@ -0,0 +1,4 @@ + +

Title header

A paragraph of content.

Paragraph number two.

A list:

  1. One.
  2. Two.
  3. Three.

A link: link text.

Header 2

A block quote:

quote words strong words

after blockquote

A figure caption.

A final note: Cross-Validated has sometimes been helpful.


Next: Next post +

+

By User Name on .

Canonical link

Exported from Medium on December 1, 2023.

diff --git a/pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html b/pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html new file mode 100644 index 00000000..02d272dc --- /dev/null +++ b/pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html @@ -0,0 +1,72 @@ +A title diff --git a/pelican/tests/test_generators.py b/pelican/tests/test_generators.py index 263579ea..920d9061 100644 --- a/pelican/tests/test_generators.py +++ b/pelican/tests/test_generators.py @@ -264,6 +264,7 @@ class TestArticlesGenerator(unittest.TestCase): def test_generate_context(self): articles_expected = [ + ["A title", "published", "medium_posts", "article"], ["Article title", "published", "Default", "article"], [ "Article with markdown and summary metadata multi", @@ -391,13 +392,24 @@ class TestArticlesGenerator(unittest.TestCase): # terms of process order will define the name for that category categories = [cat.name for cat, _ in self.generator.categories] categories_alternatives = ( - sorted(["Default", "TestCategory", "Yeah", "test", "指導書"]), - sorted(["Default", "TestCategory", "yeah", "test", "指導書"]), + sorted( + ["Default", "TestCategory", "medium_posts", "Yeah", "test", "指導書"] + ), + sorted( + ["Default", "TestCategory", "medium_posts", "yeah", "test", "指導書"] + ), ) self.assertIn(sorted(categories), categories_alternatives) # test for slug categories = [cat.slug for cat, _ in self.generator.categories] - categories_expected = ["default", "testcategory", "yeah", "test", "zhi-dao-shu"] + categories_expected = [ + "default", + "testcategory", + "medium_posts", + "yeah", + "test", + "zhi-dao-shu", + ] self.assertEqual(sorted(categories), sorted(categories_expected)) def test_do_not_use_folder_as_category(self): @@ -549,7 +561,8 @@ class TestArticlesGenerator(unittest.TestCase): granularity: {period["period"] for period in periods} for granularity, periods in period_archives.items() } - expected = {"year": {(1970,), 
(2010,), (2012,), (2014,)}} + self.maxDiff = None + expected = {"year": {(1970,), (2010,), (2012,), (2014,), (2017,)}} self.assertEqual(expected, abbreviated_archives) # Month archives enabled: @@ -570,7 +583,7 @@ class TestArticlesGenerator(unittest.TestCase): for granularity, periods in period_archives.items() } expected = { - "year": {(1970,), (2010,), (2012,), (2014,)}, + "year": {(1970,), (2010,), (2012,), (2014,), (2017,)}, "month": { (1970, "January"), (2010, "December"), @@ -578,6 +591,7 @@ class TestArticlesGenerator(unittest.TestCase): (2012, "November"), (2012, "October"), (2014, "February"), + (2017, "April"), }, } self.assertEqual(expected, abbreviated_archives) @@ -602,7 +616,7 @@ class TestArticlesGenerator(unittest.TestCase): for granularity, periods in period_archives.items() } expected = { - "year": {(1970,), (2010,), (2012,), (2014,)}, + "year": {(1970,), (2010,), (2012,), (2014,), (2017,)}, "month": { (1970, "January"), (2010, "December"), @@ -610,6 +624,7 @@ class TestArticlesGenerator(unittest.TestCase): (2012, "November"), (2012, "October"), (2014, "February"), + (2017, "April"), }, "day": { (1970, "January", 1), @@ -619,6 +634,7 @@ class TestArticlesGenerator(unittest.TestCase): (2012, "October", 30), (2012, "October", 31), (2014, "February", 9), + (2017, "April", 21), }, } self.assertEqual(expected, abbreviated_archives) @@ -836,8 +852,12 @@ class TestArticlesGenerator(unittest.TestCase): categories = sorted([category.name for category, _ in generator.categories]) categories_expected = [ - sorted(["Default", "TestCategory", "yeah", "test", "指導書"]), - sorted(["Default", "TestCategory", "Yeah", "test", "指導書"]), + sorted( + ["Default", "TestCategory", "medium_posts", "yeah", "test", "指導書"] + ), + sorted( + ["Default", "TestCategory", "medium_posts", "Yeah", "test", "指導書"] + ), ] self.assertIn(categories, categories_expected) @@ -864,6 +884,7 @@ class TestArticlesGenerator(unittest.TestCase): generator.generate_context() expected = [ + "A 
title", "An Article With Code Block To Test Typogrify Ignore", "Article title", "Article with Nonconformant HTML meta tags", diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index d1aeded0..7cc10bca 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -21,6 +21,10 @@ from pelican.tools.pelican_import import ( get_attachments, tumblr2fields, wp2fields, + mediumpost2fields, + mediumposts2fields, + strip_medium_post_content, + medium_slug, ) from pelican.utils import path_to_file_url, slugify @@ -706,3 +710,82 @@ class TestTumblrImporter(TestCaseWithCLocale): posts, posts, ) + + +class TestMediumImporter(TestCaseWithCLocale): + def setUp(self): + super().setUp() + self.test_content_root = "pelican/tests/content" + # The content coming out of parsing is similar, but not the same. + # Beautiful soup rearranges the order of attributes, for example. + # So, we keep a copy of the content for the test. + content_filename = f"{self.test_content_root}/medium_post_content.txt" + with open(content_filename, encoding="utf-8") as the_content_file: + # Many editors and scripts add a final newline, so live with that + # in our test + the_content = the_content_file.read() + assert the_content[-1] == "\n" + the_content = the_content[:-1] + self.post_tuple = ( + "A title", + the_content, + # slug: + "2017-04-21-medium-post", + "2017-04-21 17:11", + "User Name", + None, + (), + "published", + "article", + "html", + ) + + def test_mediumpost2field(self): + """Parse one post""" + post_filename = f"{self.test_content_root}/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html" + val = mediumpost2fields(post_filename) + self.assertEqual(self.post_tuple, val, val) + + def test_mediumposts2field(self): + """Parse all posts in an export directory""" + posts = [ + fields + for fields in mediumposts2fields(f"{self.test_content_root}/medium_posts") + ] + self.assertEqual(1, len(posts)) + self.assertEqual(self.post_tuple, posts[0]) + + 
def test_strip_content(self): + """Strip out unhelpful tags""" + html_doc = ( + "
This keeps lots of tags, but not " + "the
section
tags
" + ) + soup = BeautifulSoup(html_doc, "html.parser") + self.assertEqual( + "This keeps lots of tags, but not the section tags", + strip_medium_post_content(soup), + ) + + def test_medium_slug(self): + # Remove hex stuff at the end + self.assertEqual( + "2017-04-27_A-long-title", + medium_slug( + "medium-export/posts/2017-04-27_A-long-title--2971442227dd.html" + ), + ) + # Remove "--DRAFT" at the end + self.assertEqual( + "2017-04-27_A-long-title", + medium_slug("medium-export/posts/2017-04-27_A-long-title--DRAFT.html"), + ) + # Remove both (which happens) + self.assertEqual( + "draft_How-to-do", medium_slug("draft_How-to-do--DRAFT--87225c81dddd.html") + ) + # If no hex stuff, leave it alone + self.assertEqual( + "2017-04-27_A-long-title", + medium_slug("medium-export/posts/2017-04-27_A-long-title.html"), + ) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index a02fabf1..3e1f31db 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -15,6 +15,8 @@ from urllib.error import URLError from urllib.parse import quote, urlparse, urlsplit, urlunsplit from urllib.request import urlretrieve +import dateutil.parser + # because logging.setLoggerClass has to be called before logging.getLogger from pelican.log import init from pelican.settings import DEFAULT_CONFIG @@ -113,19 +115,25 @@ def decode_wp_content(content, br=True): return content -def xml_to_soup(xml): - """Opens an xml file""" +def _import_bs4(): + """Import and return bs4, otherwise sys.exit.""" try: - from bs4 import BeautifulSoup + import bs4 except ImportError: error = ( 'Missing dependency "BeautifulSoup4" and "lxml" required to ' "import XML files." 
) sys.exit(error) + return bs4 + + +def file_to_soup(xml, features="xml"): + """Reads a file, returns soup.""" + bs4 = _import_bs4() with open(xml, encoding="utf-8") as infile: xmlfile = infile.read() - soup = BeautifulSoup(xmlfile, "xml") + soup = bs4.BeautifulSoup(xmlfile, features) return soup @@ -139,7 +147,7 @@ def get_filename(post_name, post_id): def wp2fields(xml, wp_custpost=False): """Opens a wordpress XML file, and yield Pelican fields""" - soup = xml_to_soup(xml) + soup = file_to_soup(xml) items = soup.rss.channel.findAll("item") for item in items: if item.find("status").string in ["publish", "draft"]: @@ -209,7 +217,7 @@ def wp2fields(xml, wp_custpost=False): def blogger2fields(xml): """Opens a blogger XML file, and yield Pelican fields""" - soup = xml_to_soup(xml) + soup = file_to_soup(xml) entries = soup.feed.findAll("entry") for entry in entries: raw_kind = entry.find( @@ -535,6 +543,133 @@ def tumblr2fields(api_key, blogname): posts = _get_tumblr_posts(api_key, blogname, offset) +def strip_medium_post_content(soup) -> str: + """Strip some tags and attributes from medium post content. + + For example, the 'section' and 'div' tags cause trouble while rendering. + + The problem with these tags is you can get a section divider (--------------) + that is not between two pieces of content. For example: + + Some text. + + .. container:: section-divider + + -------------- + + .. container:: section-content + + More content. + + In this case, pandoc complains: "Unexpected section title or transition." + + Also, the "id" and "name" attributes in tags cause similar problems. They show + up in .rst as extra junk that separates transitions. 
+ """ + # Remove tags + # section and div cause problems + # footer also can cause problems, and has nothing we want to keep + # See https://stackoverflow.com/a/8439761 + invalid_tags = ["section", "div", "footer"] + for tag in invalid_tags: + for match in soup.findAll(tag): + match.replaceWithChildren() + + # Remove attributes + # See https://stackoverflow.com/a/9045719 + invalid_attributes = ["name", "id", "class"] + bs4 = _import_bs4() + for tag in soup.descendants: + if isinstance(tag, bs4.element.Tag): + tag.attrs = { + key: value + for key, value in tag.attrs.items() + if key not in invalid_attributes + } + + # Get the string of all content, keeping other tags + all_content = "".join(str(element) for element in soup.contents) + return all_content + + +def mediumpost2fields(filepath: str) -> tuple: + """Take an HTML post from a medium export, return Pelican fields.""" + + soup = file_to_soup(filepath, "html.parser") + if not soup: + raise ValueError(f"{filepath} could not be parsed by beautifulsoup") + kind = "article" + + content = soup.find("section", class_="e-content") + if not content: + raise ValueError(f"{filepath}: Post has no content") + + title = soup.find("title").string or "" + + raw_date = soup.find("time", class_="dt-published") + date = None + if raw_date: + # This datetime can include timezone, e.g., "2017-04-21T17:11:55.799Z" + # python before 3.11 can't parse the timezone using datetime.fromisoformat + # See also https://docs.python.org/3.10/library/datetime.html#datetime.datetime.fromisoformat + # "This does not support parsing arbitrary ISO 8601 strings" + # So, we use dateutil.parser, which can handle it. 
+ date_object = dateutil.parser.parse(raw_date.attrs["datetime"]) + date = date_object.strftime("%Y-%m-%d %H:%M") + status = "published" + else: + status = "draft" + author = soup.find("a", class_="p-author h-card") + if author: + author = author.string + + # Now that we're done with classes, we can strip the content + content = strip_medium_post_content(content) + + # medium HTML export doesn't have tag or category + # RSS feed has tags, but it doesn't have all the posts. + tags = () + + slug = medium_slug(filepath) + + # TODO: make the fields a python dataclass + return ( + title, + content, + slug, + date, + author, + None, + tags, + status, + kind, + "html", + ) + + +def medium_slug(filepath: str) -> str: + """Make the filepath of a medium exported file into a slug.""" + # slug: filename without extension + slug = os.path.basename(filepath) + slug = os.path.splitext(slug)[0] + # A medium export filename looks like date_-title-...html + # But, RST doesn't like "_-" (see https://github.com/sphinx-doc/sphinx/issues/4350) + # so get rid of it + slug = slug.replace("_-", "-") + # drop the hex string medium puts on the end of the filename, why keep it. 
+ # e.g., "-a8a8a8a8" or "---a9a9a9a9" + # also: drafts don't need "--DRAFT" + slug = re.sub(r"((-)+([0-9a-f]+|DRAFT))+$", "", slug) + return slug + + +def mediumposts2fields(medium_export_dir: str): + """Take HTML posts in a medium export directory, and yield Pelican fields.""" + for file in os.listdir(medium_export_dir): + filename = os.fsdecode(file) + yield mediumpost2fields(os.path.join(medium_export_dir, filename)) + + def feed2fields(file): """Read a feed and yield pelican fields""" import feedparser @@ -710,7 +845,7 @@ def get_attachments(xml): """returns a dictionary of posts that have attachments with a list of the attachment_urls """ - soup = xml_to_soup(xml) + soup = file_to_soup(xml) items = soup.rss.channel.findAll("item") names = {} attachments = [] @@ -836,6 +971,9 @@ def fields2pelican( posts_require_pandoc.append(filename) slug = not disable_slugs and filename or None + assert slug is None or filename == os.path.basename( + filename + ), f"filename is not a basename: {filename}" if wp_attach and attachments: try: @@ -983,6 +1121,9 @@ def main(): parser.add_argument( "--dotclear", action="store_true", dest="dotclear", help="Dotclear export" ) + parser.add_argument( + "--medium", action="store_true", dest="medium", help="Medium export" + ) parser.add_argument( "--tumblr", action="store_true", dest="tumblr", help="Tumblr export" ) @@ -1068,6 +1209,8 @@ def main(): input_type = "blogger" elif args.dotclear: input_type = "dotclear" + elif args.medium: + input_type = "medium" elif args.tumblr: input_type = "tumblr" elif args.wpfile: @@ -1076,8 +1219,8 @@ def main(): input_type = "feed" else: error = ( - "You must provide either --blogger, --dotclear, " - "--tumblr, --wpfile or --feed options" + "You must provide one of --blogger, --dotclear, " + "--medium, --tumblr, --wpfile or --feed options" ) exit(error) @@ -1096,12 +1239,16 @@ def main(): fields = blogger2fields(args.input) elif input_type == "dotclear": fields = dc2fields(args.input) + elif 
input_type == "medium": + fields = mediumposts2fields(args.input) elif input_type == "tumblr": fields = tumblr2fields(args.input, args.blogname) elif input_type == "wordpress": fields = wp2fields(args.input, args.wp_custpost or False) elif input_type == "feed": fields = feed2fields(args.input) + else: + raise ValueError(f"Unhandled input_type {input_type}") if args.wp_attach: attachments = get_attachments(args.input) diff --git a/pelican/tools/pelican_quickstart.py b/pelican/tools/pelican_quickstart.py index 3fa56194..c00a252c 100755 --- a/pelican/tools/pelican_quickstart.py +++ b/pelican/tools/pelican_quickstart.py @@ -44,6 +44,7 @@ _TEMPLATES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "templ _jinja_env = Environment( loader=FileSystemLoader(_TEMPLATES_DIR), trim_blocks=True, + keep_trailing_newline=True, ) diff --git a/pelican/tools/templates/Makefile.jinja2 b/pelican/tools/templates/Makefile.jinja2 index 93ab1aa7..67571b47 100644 --- a/pelican/tools/templates/Makefile.jinja2 +++ b/pelican/tools/templates/Makefile.jinja2 @@ -37,6 +37,7 @@ DROPBOX_DIR={{dropbox_dir}} {% endif %} {% if github %} GITHUB_PAGES_BRANCH={{github_pages_branch}} +GITHUB_PAGES_COMMIT_MESSAGE=Generate Pelican site {% endif %} @@ -161,7 +162,7 @@ cf_upload: publish {% if github %} {% set upload = upload + ["github"] %} github: publish - ghp-import -m "Generate Pelican site" -b $(GITHUB_PAGES_BRANCH) "$(OUTPUTDIR)" + ghp-import -m "$(GITHUB_PAGES_COMMIT_MESSAGE)" -b $(GITHUB_PAGES_BRANCH) "$(OUTPUTDIR)" --no-jekyll git push origin $(GITHUB_PAGES_BRANCH) {% endif %} diff --git a/pelican/utils.py b/pelican/utils.py index 49949195..f4eeb8c4 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import datetime import fnmatch import locale @@ -16,6 +18,21 @@ from html import entities from html.parser import HTMLParser from itertools import groupby from operator import attrgetter +from typing import ( + TYPE_CHECKING, + Any, + 
Callable, + Collection, + Dict, + Generator, + Iterable, + List, + Optional, + Sequence, + Tuple, + Type, + Union, +) import dateutil.parser @@ -26,10 +43,15 @@ except ModuleNotFoundError: import watchfiles from markupsafe import Markup +if TYPE_CHECKING: + from pelican.contents import Content + from pelican.readers import Readers + from pelican.settings import Settings + logger = logging.getLogger(__name__) -def sanitised_join(base_directory, *parts): +def sanitised_join(base_directory: str, *parts: str) -> str: joined = posixize_path(os.path.abspath(os.path.join(base_directory, *parts))) base = posixize_path(os.path.abspath(base_directory)) if not joined.startswith(base): @@ -38,7 +60,7 @@ def sanitised_join(base_directory, *parts): return joined -def strftime(date, date_format): +def strftime(date: datetime.datetime, date_format: str) -> str: """ Enhanced replacement for built-in strftime with zero stripping @@ -107,10 +129,10 @@ class DateFormatter: defined in LOCALE setting """ - def __init__(self): + def __init__(self) -> None: self.locale = locale.setlocale(locale.LC_TIME) - def __call__(self, date, date_format): + def __call__(self, date: datetime.datetime, date_format: str) -> str: # on OSX, encoding from LC_CTYPE determines the unicode output in PY3 # make sure it's same as LC_TIME with temporary_locale(self.locale, locale.LC_TIME), temporary_locale( @@ -129,11 +151,11 @@ class memoized: """ - def __init__(self, func): + def __init__(self, func: Callable) -> None: self.func = func - self.cache = {} + self.cache: Dict[Any, Any] = {} - def __call__(self, *args): + def __call__(self, *args) -> Any: if not isinstance(args, Hashable): # uncacheable. a list, for instance. # better to not cache than blow up. 
@@ -145,17 +167,23 @@ class memoized: self.cache[args] = value return value - def __repr__(self): + def __repr__(self) -> Optional[str]: return self.func.__doc__ - def __get__(self, obj, objtype): + def __get__(self, obj: Any, objtype): """Support instance methods.""" fn = partial(self.__call__, obj) fn.cache = self.cache return fn -def deprecated_attribute(old, new, since=None, remove=None, doc=None): +def deprecated_attribute( + old: str, + new: str, + since: Tuple[int, ...], + remove: Optional[Tuple[int, ...]] = None, + doc: Optional[str] = None, +): """Attribute deprecation decorator for gentle upgrades For example: @@ -196,7 +224,7 @@ def deprecated_attribute(old, new, since=None, remove=None, doc=None): return decorator -def get_date(string): +def get_date(string: str) -> datetime.datetime: """Return a datetime object from a string. If no format matches the given date, raise a ValueError. @@ -210,7 +238,9 @@ def get_date(string): @contextmanager -def pelican_open(filename, mode="r", strip_crs=(sys.platform == "win32")): +def pelican_open( + filename: str, mode: str = "r", strip_crs: bool = (sys.platform == "win32") +) -> Generator[str, None, None]: """Open a file and return its content""" # utf-8-sig will clear any BOM if present @@ -219,7 +249,12 @@ def pelican_open(filename, mode="r", strip_crs=(sys.platform == "win32")): yield content -def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False): +def slugify( + value: str, + regex_subs: Iterable[Tuple[str, str]] = (), + preserve_case: bool = False, + use_unicode: bool = False, +) -> str: """ Normalizes string, converts to lowercase, removes non-alpha characters, and converts spaces to hyphens. 
@@ -234,7 +269,7 @@ def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False): import unidecode - def normalize_unicode(text): + def normalize_unicode(text: str) -> str: # normalize text by compatibility composition # see: https://en.wikipedia.org/wiki/Unicode_equivalence return unicodedata.normalize("NFKC", text) @@ -261,7 +296,9 @@ def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False): return value.strip() -def copy(source, destination, ignores=None): +def copy( + source: str, destination: str, ignores: Optional[Iterable[str]] = None +) -> None: """Recursively copy source into destination. If source is a file, destination has to be a file as well. @@ -333,7 +370,7 @@ def copy(source, destination, ignores=None): ) -def copy_file(source, destination): +def copy_file(source: str, destination: str) -> None: """Copy a file""" try: shutil.copyfile(source, destination) @@ -343,7 +380,7 @@ def copy_file(source, destination): ) -def clean_output_dir(path, retention): +def clean_output_dir(path: str, retention: Iterable[str]) -> None: """Remove all files from output directory except those in retention list""" if not os.path.exists(path): @@ -380,24 +417,24 @@ def clean_output_dir(path, retention): logger.error("Unable to delete %s, file type unknown", file) -def get_relative_path(path): +def get_relative_path(path: str) -> str: """Return the relative path from the given path to the root path.""" components = split_all(path) - if len(components) <= 1: + if components is None or len(components) <= 1: return os.curdir else: parents = [os.pardir] * (len(components) - 1) return os.path.join(*parents) -def path_to_url(path): +def path_to_url(path: str) -> str: """Return the URL corresponding to a given path.""" if path is not None: path = posixize_path(path) return path -def posixize_path(rel_path): +def posixize_path(rel_path: str) -> str: """Use '/' as path separator, so that source references, like '{static}/foo/bar.jpg' or 
'extras/favicon.ico', will work on Windows as well as on Mac and Linux.""" @@ -426,20 +463,20 @@ class _HTMLWordTruncator(HTMLParser): _singlets = ("br", "col", "link", "base", "img", "param", "area", "hr", "input") class TruncationCompleted(Exception): - def __init__(self, truncate_at): + def __init__(self, truncate_at: int) -> None: super().__init__(truncate_at) self.truncate_at = truncate_at - def __init__(self, max_words): + def __init__(self, max_words: int) -> None: super().__init__(convert_charrefs=False) self.max_words = max_words self.words_found = 0 self.open_tags = [] self.last_word_end = None - self.truncate_at = None + self.truncate_at: Optional[int] = None - def feed(self, *args, **kwargs): + def feed(self, *args, **kwargs) -> None: try: super().feed(*args, **kwargs) except self.TruncationCompleted as exc: @@ -447,29 +484,29 @@ class _HTMLWordTruncator(HTMLParser): else: self.truncate_at = None - def getoffset(self): + def getoffset(self) -> int: line_start = 0 lineno, line_offset = self.getpos() for i in range(lineno - 1): line_start = self.rawdata.index("\n", line_start) + 1 return line_start + line_offset - def add_word(self, word_end): + def add_word(self, word_end: int) -> None: self.words_found += 1 self.last_word_end = None if self.words_found == self.max_words: raise self.TruncationCompleted(word_end) - def add_last_word(self): + def add_last_word(self) -> None: if self.last_word_end is not None: self.add_word(self.last_word_end) - def handle_starttag(self, tag, attrs): + def handle_starttag(self, tag: str, attrs: Any) -> None: self.add_last_word() if tag not in self._singlets: self.open_tags.insert(0, tag) - def handle_endtag(self, tag): + def handle_endtag(self, tag: str) -> None: self.add_last_word() try: i = self.open_tags.index(tag) @@ -480,7 +517,7 @@ class _HTMLWordTruncator(HTMLParser): # all unclosed intervening start tags with omitted end tags del self.open_tags[: i + 1] - def handle_data(self, data): + def handle_data(self, data: 
str) -> None: word_end = 0 offset = self.getoffset() @@ -498,7 +535,7 @@ class _HTMLWordTruncator(HTMLParser): if word_end < len(data): self.add_last_word() - def _handle_ref(self, name, char): + def _handle_ref(self, name: str, char: str) -> None: """ Called by handle_entityref() or handle_charref() when a ref like `—`, `—`, or `—` is found. @@ -542,7 +579,7 @@ class _HTMLWordTruncator(HTMLParser): else: self.add_last_word() - def handle_entityref(self, name): + def handle_entityref(self, name: str) -> None: """ Called when an entity ref like '—' is found @@ -555,7 +592,7 @@ class _HTMLWordTruncator(HTMLParser): char = "" self._handle_ref(name, char) - def handle_charref(self, name): + def handle_charref(self, name: str) -> None: """ Called when a char ref like '—' or '—' is found @@ -573,7 +610,7 @@ class _HTMLWordTruncator(HTMLParser): self._handle_ref("#" + name, char) -def truncate_html_words(s, num, end_text="…"): +def truncate_html_words(s: str, num: int, end_text: str = "…") -> str: """Truncates HTML to a certain number of words. (not counting tags and comments). Closes opened tags if they were correctly @@ -599,7 +636,10 @@ def truncate_html_words(s, num, end_text="…"): return out -def process_translations(content_list, translation_id=None): +def process_translations( + content_list: List[Content], + translation_id: Optional[Union[str, Collection[str]]] = None, +) -> Tuple[List[Content], List[Content]]: """Finds translations and returns them. 
For each content_list item, populates the 'translations' attribute, and @@ -657,7 +697,7 @@ def process_translations(content_list, translation_id=None): return index, translations -def get_original_items(items, with_str): +def get_original_items(items: List[Content], with_str: str) -> List[Content]: def _warn_source_paths(msg, items, *extra): args = [len(items)] args.extend(extra) @@ -697,7 +737,10 @@ def get_original_items(items, with_str): return original_items -def order_content(content_list, order_by="slug"): +def order_content( + content_list: List[Content], + order_by: Union[str, Callable[[Content], Any], None] = "slug", +) -> List[Content]: """Sorts content. order_by can be a string of an attribute or sorting function. If order_by @@ -757,7 +800,11 @@ def order_content(content_list, order_by="slug"): return content_list -def wait_for_changes(settings_file, reader_class, settings): +def wait_for_changes( + settings_file: str, + reader_class: Type["Readers"], + settings: "Settings", +): content_path = settings.get("PATH", "") theme_path = settings.get("THEME", "") ignore_files = { @@ -787,13 +834,15 @@ def wait_for_changes(settings_file, reader_class, settings): return next( watchfiles.watch( *watching_paths, - watch_filter=watchfiles.DefaultFilter(ignore_entity_patterns=ignore_files), + watch_filter=watchfiles.DefaultFilter(ignore_entity_patterns=ignore_files), # type: ignore rust_timeout=0, ) ) -def set_date_tzinfo(d, tz_name=None): +def set_date_tzinfo( + d: datetime.datetime, tz_name: Optional[str] = None +) -> datetime.datetime: """Set the timezone for dates that don't have tzinfo""" if tz_name and not d.tzinfo: timezone = ZoneInfo(tz_name) @@ -804,11 +853,11 @@ def set_date_tzinfo(d, tz_name=None): return d -def mkdir_p(path): +def mkdir_p(path: str) -> None: os.makedirs(path, exist_ok=True) -def split_all(path): +def split_all(path: Union[str, pathlib.Path, None]) -> Optional[Sequence[str]]: """Split a path into a list of components While 
os.path.split() splits a single component off the back of @@ -839,12 +888,12 @@ def split_all(path): ) -def path_to_file_url(path): +def path_to_file_url(path: str) -> str: """Convert file-system path to file:// URL""" return urllib.parse.urljoin("file://", urllib.request.pathname2url(path)) -def maybe_pluralize(count, singular, plural): +def maybe_pluralize(count: int, singular: str, plural: str) -> str: """ Returns a formatted string containing count and plural if count is not 1 Returns count and singular if count is 1 @@ -861,7 +910,9 @@ def maybe_pluralize(count, singular, plural): @contextmanager -def temporary_locale(temp_locale=None, lc_category=locale.LC_ALL): +def temporary_locale( + temp_locale: Optional[str] = None, lc_category: int = locale.LC_ALL +) -> Generator[None, None, None]: """ Enable code to run in a context with a temporary locale Resets the locale back when exiting context. diff --git a/pyproject.toml b/pyproject.toml index 18ff6afa..503405a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ changelog-header = "###############" version-header = "=" [tool.pdm] +ignore_package_warnings = ["sphinx"] [tool.pdm.scripts] docbuild = "invoke docbuild" @@ -95,7 +96,7 @@ dev = [ "pytest-xdist>=3.4.0", "tox>=4.11.3", "invoke>=2.2.0", - "ruff>=0.1.5", + "ruff>=0.1.15,<0.2.0", "tomli>=2.0.1; python_version < \"3.11\"", ]