pelican-theme/pelican/contents.py

import copy
import datetime
import locale
import logging
import os
import re
from datetime import timezone
from html import unescape
from urllib.parse import unquote, urljoin, urlparse, urlunparse

try:
    from zoneinfo import ZoneInfo
except ModuleNotFoundError:
    from backports.zoneinfo import ZoneInfo


from pelican.plugins import signals
from pelican.settings import DEFAULT_CONFIG
from pelican.utils import (
    deprecated_attribute,
    memoized,
    path_to_url,
    posixize_path,
    sanitised_join,
    set_date_tzinfo,
    slugify,
    truncate_html_words,
)

# Import these so that they're available when you import from pelican.contents.
from pelican.urlwrappers import Author, Category, Tag, URLWrapper  # NOQA

logger = logging.getLogger(__name__)


class Content:
    """Represents a content.

    :param content: the string to parse, containing the original content.
    :param metadata: the metadata associated to this page (optional).
    :param settings: the settings dictionary (optional).
    :param source_path: The location of the source of this content (if any).
    :param context: The shared context between generators.

    """

    @deprecated_attribute(old="filename", new="source_path", since=(3, 2, 0))
    def filename():
        return None

    def __init__(
        self, content, metadata=None, settings=None, source_path=None, context=None
    ):
        if metadata is None:
            metadata = {}
        if settings is None:
            settings = copy.deepcopy(DEFAULT_CONFIG)

        self.settings = settings
        self._content = content
        if context is None:
            context = {}
        self._context = context
        self.translations = []

        local_metadata = dict()
        local_metadata.update(metadata)

        # set metadata as attributes
        for key, value in local_metadata.items():
            if key in ("save_as", "url"):
                key = "override_" + key
            setattr(self, key.lower(), value)

        # also keep track of the metadata attributes available
        self.metadata = local_metadata

        # default template if it's not defined in page
        self.template = self._get_template()

        # First, read the authors from "authors", if not, fallback to "author"
        # and if not use the settings defined one, if any.
        if not hasattr(self, "author"):
            if hasattr(self, "authors"):
                self.author = self.authors[0]
            elif "AUTHOR" in settings:
                self.author = Author(settings["AUTHOR"], settings)

        if not hasattr(self, "authors") and hasattr(self, "author"):
            self.authors = [self.author]

        # XXX Split all the following code into pieces, there is too much here.

        # manage languages
        self.in_default_lang = True
        if "DEFAULT_LANG" in settings:
            default_lang = settings["DEFAULT_LANG"].lower()
            if not hasattr(self, "lang"):
                self.lang = default_lang

            self.in_default_lang = self.lang == default_lang

        # create the slug if not existing, generate slug according to
        # setting of SLUG_ATTRIBUTE
        if not hasattr(self, "slug"):
            if settings["SLUGIFY_SOURCE"] == "title" and hasattr(self, "title"):
                value = self.title
            elif settings["SLUGIFY_SOURCE"] == "basename" and source_path is not None:
                value = os.path.basename(os.path.splitext(source_path)[0])
            else:
                value = None
            if value is not None:
                self.slug = slugify(
                    value,
                    regex_subs=settings.get("SLUG_REGEX_SUBSTITUTIONS", []),
                    preserve_case=settings.get("SLUGIFY_PRESERVE_CASE", False),
                    use_unicode=settings.get("SLUGIFY_USE_UNICODE", False),
                )

        self.source_path = source_path
        self.relative_source_path = self.get_relative_source_path()

        # manage the date format
        if not hasattr(self, "date_format"):
            if hasattr(self, "lang") and self.lang in settings["DATE_FORMATS"]:
                self.date_format = settings["DATE_FORMATS"][self.lang]
            else:
                self.date_format = settings["DEFAULT_DATE_FORMAT"]

        if isinstance(self.date_format, tuple):
            locale_string = self.date_format[0]
            locale.setlocale(locale.LC_ALL, locale_string)
            self.date_format = self.date_format[1]

        # manage timezone
        default_timezone = settings.get("TIMEZONE", "UTC")
        timezone = getattr(self, "timezone", default_timezone)
        self.timezone = ZoneInfo(timezone)

        if hasattr(self, "date"):
            self.date = set_date_tzinfo(self.date, timezone)
            self.locale_date = self.date.strftime(self.date_format)

        if hasattr(self, "modified"):
            self.modified = set_date_tzinfo(self.modified, timezone)
            self.locale_modified = self.modified.strftime(self.date_format)

        # manage status
        if not hasattr(self, "status"):
            # Previous default of None broke comment plugins and perhaps others
            self.status = getattr(self, "default_status", "")

        # store the summary metadata if it is set
        if "summary" in metadata:
            self._summary = metadata["summary"]

        signals.content_object_init.send(self)

    def __str__(self):
        return self.source_path or repr(self)

    def _has_valid_mandatory_properties(self):
        """Test mandatory properties are set."""
        for prop in self.mandatory_properties:
            if not hasattr(self, prop):
                logger.error(
                    "Skipping %s: could not find information about '%s'", self, prop
                )
                return False
        return True

    def _has_valid_save_as(self):
        """Return true if save_as doesn't write outside output path, false
        otherwise."""
        try:
            output_path = self.settings["OUTPUT_PATH"]
        except KeyError:
            # we cannot check
            return True

        try:
            sanitised_join(output_path, self.save_as)
        except RuntimeError:  # outside output_dir
            logger.error(
                "Skipping %s: file %r would be written outside output path",
                self,
                self.save_as,
            )
            return False

        return True

    def _has_valid_status(self):
        if hasattr(self, "allowed_statuses"):
            if self.status not in self.allowed_statuses:
                logger.error(
                    "Unknown status '%s' for file %s, skipping it. (Not in %s)",
                    self.status,
                    self,
                    self.allowed_statuses,
                )
                return False

        # if undefined we allow all
        return True

    def is_valid(self):
        """Validate Content"""
        # Use all() to not short circuit and get results of all validations
        return all(
            [
                self._has_valid_mandatory_properties(),
                self._has_valid_save_as(),
                self._has_valid_status(),
            ]
        )

    @property
    def url_format(self):
        """Returns the URL, formatted with the proper values"""
        metadata = copy.copy(self.metadata)
        path = self.metadata.get("path", self.get_relative_source_path())
        metadata.update(
            {
                "path": path_to_url(path),
                "slug": getattr(self, "slug", ""),
                "lang": getattr(self, "lang", "en"),
                "date": getattr(self, "date", datetime.datetime.now()),
                "author": self.author.slug if hasattr(self, "author") else "",
                "category": self.category.slug if hasattr(self, "category") else "",
            }
        )
        return metadata

    def _expand_settings(self, key, klass=None):
        if not klass:
            klass = self.__class__.__name__
        fq_key = ("{}_{}".format(klass, key)).upper()
        return str(self.settings[fq_key]).format(**self.url_format)

    def get_url_setting(self, key):
        if hasattr(self, "override_" + key):
            return getattr(self, "override_" + key)
        key = key if self.in_default_lang else "lang_%s" % key
        return self._expand_settings(key)

    def _link_replacer(self, siteurl, m):
        what = m.group("what")
        value = urlparse(m.group("value"))
        path = value.path
        origin = m.group("path")

        # urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html")
        # so if RELATIVE_URLS are enabled, we fall back to os.path.join() to
        # properly get `../a.html`. However, os.path.join() produces
        # `baz/http://foo/bar.html` for join("baz", "http://foo/bar.html")
        # instead of correct "http://foo/bar.html", so one has to pick a side
        # as there is no silver bullet.
        if self.settings["RELATIVE_URLS"]:
            joiner = os.path.join
        else:
            joiner = urljoin

            # However, it's not *that* simple: urljoin("blog", "index.html")
            # produces just `index.html` instead of `blog/index.html` (unlike
            # os.path.join()), so in order to get a correct answer one needs to
            # append a trailing slash to siteurl in that case. This also makes
            # the new behavior fully compatible with Pelican 3.7.1.
            if not siteurl.endswith("/"):
                siteurl += "/"

        # XXX Put this in a different location.
        if what in {"filename", "static", "attach"}:

            def _get_linked_content(key, url):
                nonlocal value

                def _find_path(path):
                    if path.startswith("/"):
                        path = path[1:]
                    else:
                        # relative to the source path of this content
                        path = self.get_relative_source_path(
                            os.path.join(self.relative_dir, path)
                        )
                    return self._context[key].get(path, None)

                # try path
                result = _find_path(url.path)
                if result is not None:
                    return result

                # try unquoted path
                result = _find_path(unquote(url.path))
                if result is not None:
                    return result

                # try html unescaped url
                unescaped_url = urlparse(unescape(url.geturl()))
                result = _find_path(unescaped_url.path)
                if result is not None:
                    value = unescaped_url
                    return result

                # check if a static file is linked with {filename}
                if what == "filename" and key == "generated_content":
                    linked_content = _get_linked_content("static_content", value)
                    if linked_content:
                        logger.warning(
                            "{filename} used for linking to static"
                            " content %s in %s. Use {static} instead",
                            value.path,
                            self.get_relative_source_path(),
                        )
                        return linked_content

                return None

            if what == "filename":
                key = "generated_content"
            else:
                key = "static_content"

            linked_content = _get_linked_content(key, value)
            if linked_content:
                if what == "attach":
                    linked_content.attach_to(self)
                origin = joiner(siteurl, linked_content.url)
                origin = origin.replace("\\", "/")  # for Windows paths.
            else:
                logger.warning(
                    "Unable to find '%s', skipping url replacement.",
                    value.geturl(),
                    extra={
                        "limit_msg": (
                            "Other resources were not found "
                            "and their urls not replaced"
                        )
                    },
                )
        elif what == "category":
            origin = joiner(siteurl, Category(path, self.settings).url)
        elif what == "tag":
            origin = joiner(siteurl, Tag(path, self.settings).url)
        elif what == "index":
            origin = joiner(siteurl, self.settings["INDEX_SAVE_AS"])
        elif what == "author":
            origin = joiner(siteurl, Author(path, self.settings).url)
        else:
            logger.warning(
                "Replacement Indicator '%s' not recognized, " "skipping replacement",
                what,
            )

        # keep all other parts, such as query, fragment, etc.
        parts = list(value)
        parts[2] = origin
        origin = urlunparse(parts)

        return "".join((m.group("markup"), m.group("quote"), origin, m.group("quote")))

    def _get_intrasite_link_regex(self):
        intrasite_link_regex = self.settings["INTRASITE_LINK_REGEX"]
        regex = r"""
            (?P<markup><[^\>]+  # match tag with all url-value attributes
                (?:href|src|poster|data|cite|formaction|action|content)\s*=\s*)

            (?P<quote>["\'])      # require value to be quoted
            (?P<path>{}(?P<value>.*?))  # the url value
            (?P=quote)""".format(intrasite_link_regex)
        return re.compile(regex, re.X)

    def _update_content(self, content, siteurl):
        """Update the content attribute.

        Change all the relative paths of the content to relative paths
        suitable for the output content.

        :param content: content resource that will be passed to the templates.
        :param siteurl: siteurl which is locally generated by the writer in
                        case of RELATIVE_URLS.
        """
        if not content:
            return content

        hrefs = self._get_intrasite_link_regex()
        return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content)

    def get_static_links(self):
        static_links = set()
        hrefs = self._get_intrasite_link_regex()
        for m in hrefs.finditer(self._content):
            what = m.group("what")
            value = urlparse(m.group("value"))
            path = value.path
            if what not in {"static", "attach"}:
                continue
            if path.startswith("/"):
                path = path[1:]
            else:
                # relative to the source path of this content
                path = self.get_relative_source_path(
                    os.path.join(self.relative_dir, path)
                )
            path = path.replace("%20", " ")
            static_links.add(path)
        return static_links

    def get_siteurl(self):
        return self._context.get("localsiteurl", "")

    @memoized
    def get_content(self, siteurl):
        if hasattr(self, "_get_content"):
            content = self._get_content()
        else:
            content = self._content
        return self._update_content(content, siteurl)

    @property
    def content(self):
        return self.get_content(self.get_siteurl())

    @memoized
    def get_summary(self, siteurl):
        """Returns the summary of an article.

        This is based on the summary metadata if set, otherwise truncate the
        content.
        """
        if "summary" in self.metadata:
            return self.metadata["summary"]

        if self.settings["SUMMARY_MAX_LENGTH"] is None:
            return self.content

        return truncate_html_words(
            self.content,
            self.settings["SUMMARY_MAX_LENGTH"],
            self.settings["SUMMARY_END_SUFFIX"],
        )

    @property
    def summary(self):
        return self.get_summary(self.get_siteurl())

    def _get_summary(self):
        """deprecated function to access summary"""

        logger.warning(
            "_get_summary() has been deprecated since 3.6.4. "
            "Use the summary decorator instead"
        )
        return self.summary

    @summary.setter
    def summary(self, value):
        """Dummy function"""
        pass

    @property
    def status(self):
        return self._status

    @status.setter
    def status(self, value):
        # TODO maybe typecheck
        self._status = value.lower()

    @property
    def url(self):
        return self.get_url_setting("url")

    @property
    def save_as(self):
        return self.get_url_setting("save_as")

    def _get_template(self):
        if hasattr(self, "template") and self.template is not None:
            return self.template
        else:
            return self.default_template

    def get_relative_source_path(self, source_path=None):
        """Return the relative path (from the content path) to the given
        source_path.

        If no source path is specified, use the source path of this
        content object.
        """
        if not source_path:
            source_path = self.source_path
        if source_path is None:
            return None

        return posixize_path(
            os.path.relpath(
                os.path.abspath(os.path.join(self.settings["PATH"], source_path)),
                os.path.abspath(self.settings["PATH"]),
            )
        )

    @property
    def relative_dir(self):
        return posixize_path(
            os.path.dirname(
                os.path.relpath(
                    os.path.abspath(self.source_path),
                    os.path.abspath(self.settings["PATH"]),
                )
            )
        )

    def refresh_metadata_intersite_links(self):
        for key in self.settings["FORMATTED_FIELDS"]:
            if key in self.metadata and key != "summary":
                value = self._update_content(self.metadata[key], self.get_siteurl())
                self.metadata[key] = value
                setattr(self, key.lower(), value)

        # _summary is an internal variable that some plugins may be writing to,
        # so ensure changes to it are picked up
        if (
            "summary" in self.settings["FORMATTED_FIELDS"]
            and "summary" in self.metadata
        ):
            self._summary = self._update_content(self._summary, self.get_siteurl())
            self.metadata["summary"] = self._summary


class Page(Content):
    mandatory_properties = ("title",)
    allowed_statuses = ("published", "hidden", "draft")
    default_status = "published"
    default_template = "page"

    def _expand_settings(self, key):
        klass = "draft_page" if self.status == "draft" else None
        return super()._expand_settings(key, klass)


class Article(Content):
    mandatory_properties = ("title", "date", "category")
    allowed_statuses = ("published", "hidden", "draft")
    default_status = "published"
    default_template = "article"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # handle WITH_FUTURE_DATES (designate article to draft based on date)
        if not self.settings["WITH_FUTURE_DATES"] and hasattr(self, "date"):
            if self.date.tzinfo is None:
                now = datetime.datetime.now()
            else:
                now = datetime.datetime.utcnow().replace(tzinfo=timezone.utc)
            if self.date > now:
                self.status = "draft"

        # if we are a draft and there is no date provided, set max datetime
        if not hasattr(self, "date") and self.status == "draft":
            self.date = datetime.datetime.max.replace(tzinfo=self.timezone)

    def _expand_settings(self, key):
        klass = "draft" if self.status == "draft" else "article"
        return super()._expand_settings(key, klass)


class Static(Content):
    mandatory_properties = ("title",)
    default_status = "published"
    default_template = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._output_location_referenced = False

    @deprecated_attribute(old="filepath", new="source_path", since=(3, 2, 0))
    def filepath():
        return None

    @deprecated_attribute(old="src", new="source_path", since=(3, 2, 0))
    def src():
        return None

    @deprecated_attribute(old="dst", new="save_as", since=(3, 2, 0))
    def dst():
        return None

    @property
    def url(self):
        # Note when url has been referenced, so we can avoid overriding it.
        self._output_location_referenced = True
        return super().url

    @property
    def save_as(self):
        # Note when save_as has been referenced, so we can avoid overriding it.
        self._output_location_referenced = True
        return super().save_as

    def attach_to(self, content):
        """Override our output directory with that of the given content object."""

        # Determine our file's new output path relative to the linking
        # document. If it currently lives beneath the linking
        # document's source directory, preserve that relationship on output.
        # Otherwise, make it a sibling.

        linking_source_dir = os.path.dirname(content.source_path)
        tail_path = os.path.relpath(self.source_path, linking_source_dir)
        if tail_path.startswith(os.pardir + os.sep):
            tail_path = os.path.basename(tail_path)
        new_save_as = os.path.join(os.path.dirname(content.save_as), tail_path)

        # We do not build our new url by joining tail_path with the linking
        # document's url, because we cannot know just by looking at the latter
        # whether it points to the document itself or to its parent directory.
        # (An url like 'some/content' might mean a directory named 'some'
        # with a file named 'content', or it might mean a directory named
        # 'some/content' with a file named 'index.html'.) Rather than trying
        # to figure it out by comparing the linking document's url and save_as
        # path, we simply build our new url from our new save_as path.

        new_url = path_to_url(new_save_as)

        def _log_reason(reason):
            logger.warning(
                "The {attach} link in %s cannot relocate "
                "%s because %s. Falling back to "
                "{filename} link behavior instead.",
                content.get_relative_source_path(),
                self.get_relative_source_path(),
                reason,
                extra={"limit_msg": "More {attach} warnings silenced."},
            )

        # We never override an override, because we don't want to interfere
        # with user-defined overrides that might be in EXTRA_PATH_METADATA.
        if hasattr(self, "override_save_as") or hasattr(self, "override_url"):
            if new_save_as != self.save_as or new_url != self.url:
                _log_reason("its output location was already overridden")
            return

        # We never change an output path that has already been referenced,
        # because we don't want to break links that depend on that path.
        if self._output_location_referenced:
            if new_save_as != self.save_as or new_url != self.url:
                _log_reason("another link already referenced its location")
            return

        self.override_save_as = new_save_as
        self.override_url = new_url