1
0
Fork 0
forked from github/pelican
pelican-theme/pelican/contents.py
winlu 089b46b7eb Consolidate validation of content (#2128)
* Consolidate validation of content

Previously we validated content outside of the content class via
calls to `is_valid_content` and some additional checks in page /
article generators (valid status).
This commit moves those checks all into content.valid() resulting
in a cleaner code structure.
This allows us to restructure how generators interact with content,
removing several old bugs in pelican (#1748, #1356, #2098).

- move verification function into content class
- move generator verifying content to contents class
- remove unused quote class
- remove draft class (no more rereading drafts)
- move auto draft status setter into Article.__init__
- add now parsing draft to basic test output
- remove problematic DEFAULT_STATUS setting
- add setter/getter for content.status
  removes need for lower() calls when verifying status

* expand c4b184fa32

Mostly implement feedback by @iKevinY.

* rename content.valid to content.is_valid
* rename valid_* functions to has_valid_*
* update tests and function calls in code accordingly
2017-07-24 10:01:14 -07:00

532 lines
19 KiB
Python

# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
import copy
import locale
import logging
import os
import re
import sys
import pytz
import six
from six.moves.urllib.parse import urlparse, urlunparse
from pelican import signals
from pelican.settings import DEFAULT_CONFIG
from pelican.utils import (SafeDatetime, deprecated_attribute, memoized,
path_to_url, posixize_path,
python_2_unicode_compatible, sanitised_join,
set_date_tzinfo, slugify, strftime,
truncate_html_words)
# Import these so that they're avalaible when you import from pelican.contents.
from pelican.urlwrappers import (Author, Category, Tag, URLWrapper) # NOQA
logger = logging.getLogger(__name__)
@python_2_unicode_compatible
class Content(object):
"""Represents a content.
:param content: the string to parse, containing the original content.
:param metadata: the metadata associated to this page (optional).
:param settings: the settings dictionary (optional).
:param source_path: The location of the source of this content (if any).
:param context: The shared context between generators.
"""
@deprecated_attribute(old='filename', new='source_path', since=(3, 2, 0))
def filename():
return None
def __init__(self, content, metadata=None, settings=None,
source_path=None, context=None):
if metadata is None:
metadata = {}
if settings is None:
settings = copy.deepcopy(DEFAULT_CONFIG)
self.settings = settings
self._content = content
if context is None:
context = {}
self._context = context
self.translations = []
local_metadata = dict()
local_metadata.update(metadata)
# set metadata as attributes
for key, value in local_metadata.items():
if key in ('save_as', 'url'):
key = 'override_' + key
setattr(self, key.lower(), value)
# also keep track of the metadata attributes available
self.metadata = local_metadata
# default template if it's not defined in page
self.template = self._get_template()
# First, read the authors from "authors", if not, fallback to "author"
# and if not use the settings defined one, if any.
if not hasattr(self, 'author'):
if hasattr(self, 'authors'):
self.author = self.authors[0]
elif 'AUTHOR' in settings:
self.author = Author(settings['AUTHOR'], settings)
if not hasattr(self, 'authors') and hasattr(self, 'author'):
self.authors = [self.author]
# XXX Split all the following code into pieces, there is too much here.
# manage languages
self.in_default_lang = True
if 'DEFAULT_LANG' in settings:
default_lang = settings['DEFAULT_LANG'].lower()
if not hasattr(self, 'lang'):
self.lang = default_lang
self.in_default_lang = (self.lang == default_lang)
# create the slug if not existing, generate slug according to
# setting of SLUG_ATTRIBUTE
if not hasattr(self, 'slug'):
if (settings['SLUGIFY_SOURCE'] == 'title' and
hasattr(self, 'title')):
self.slug = slugify(self.title,
settings.get('SLUG_SUBSTITUTIONS', ()))
elif (settings['SLUGIFY_SOURCE'] == 'basename' and
source_path is not None):
basename = os.path.basename(
os.path.splitext(source_path)[0])
self.slug = slugify(
basename, settings.get('SLUG_SUBSTITUTIONS', ()))
self.source_path = source_path
# manage the date format
if not hasattr(self, 'date_format'):
if hasattr(self, 'lang') and self.lang in settings['DATE_FORMATS']:
self.date_format = settings['DATE_FORMATS'][self.lang]
else:
self.date_format = settings['DEFAULT_DATE_FORMAT']
if isinstance(self.date_format, tuple):
locale_string = self.date_format[0]
if sys.version_info < (3, ) and isinstance(locale_string,
six.text_type):
locale_string = locale_string.encode('ascii')
locale.setlocale(locale.LC_ALL, locale_string)
self.date_format = self.date_format[1]
# manage timezone
default_timezone = settings.get('TIMEZONE', 'UTC')
timezone = getattr(self, 'timezone', default_timezone)
if hasattr(self, 'date'):
self.date = set_date_tzinfo(self.date, timezone)
self.locale_date = strftime(self.date, self.date_format)
if hasattr(self, 'modified'):
self.modified = set_date_tzinfo(self.modified, timezone)
self.locale_modified = strftime(self.modified, self.date_format)
# manage status
if not hasattr(self, 'status'):
self.status = getattr(self, 'default_status', None)
# store the summary metadata if it is set
if 'summary' in metadata:
self._summary = metadata['summary']
signals.content_object_init.send(self)
def __str__(self):
return self.source_path or repr(self)
def _has_valid_mandatory_properties(self):
"""Test mandatory properties are set."""
for prop in self.mandatory_properties:
if not hasattr(self, prop):
logger.error(
"Skipping %s: could not find information about '%s'",
self, prop)
return False
return True
def _has_valid_save_as(self):
"""Return true if save_as doesn't write outside output path, false
otherwise."""
try:
output_path = self.settings["OUTPUT_PATH"]
except KeyError:
# we cannot check
return True
try:
sanitised_join(output_path, self.save_as)
except RuntimeError: # outside output_dir
logger.error(
"Skipping %s: file %r would be written outside output path",
self,
self.save_as,
)
return False
return True
def _has_valid_status(self):
if hasattr(self, 'allowed_statuses'):
if self.status not in self.allowed_statuses:
logger.error(
"Unknown status '%s' for file %s, skipping it.",
self.status,
self
)
return False
# if undefined we allow all
return True
def is_valid(self):
"""Validate Content"""
# Use all() to not short circuit and get results of all validations
return all([self._has_valid_mandatory_properties(),
self._has_valid_save_as(),
self._has_valid_status()])
@property
def url_format(self):
"""Returns the URL, formatted with the proper values"""
metadata = copy.copy(self.metadata)
path = self.metadata.get('path', self.get_relative_source_path())
metadata.update({
'path': path_to_url(path),
'slug': getattr(self, 'slug', ''),
'lang': getattr(self, 'lang', 'en'),
'date': getattr(self, 'date', SafeDatetime.now()),
'author': self.author.slug if hasattr(self, 'author') else '',
'tag': self.tag.slug if hasattr(self, 'tag') else '',
'category': self.category.slug if hasattr(self, 'category') else ''
})
return metadata
def _expand_settings(self, key, klass=None):
if not klass:
klass = self.__class__.__name__
fq_key = ('%s_%s' % (klass, key)).upper()
return self.settings[fq_key].format(**self.url_format)
def get_url_setting(self, key):
if hasattr(self, 'override_' + key):
return getattr(self, 'override_' + key)
key = key if self.in_default_lang else 'lang_%s' % key
return self._expand_settings(key)
def _update_content(self, content, siteurl):
"""Update the content attribute.
Change all the relative paths of the content to relative paths
suitable for the output content.
:param content: content resource that will be passed to the templates.
:param siteurl: siteurl which is locally generated by the writer in
case of RELATIVE_URLS.
"""
if not content:
return content
instrasite_link_regex = self.settings['INTRASITE_LINK_REGEX']
regex = r"""
(?P<markup><[^\>]+ # match tag with all url-value attributes
(?:href|src|poster|data|cite|formaction|action)\s*=\s*)
(?P<quote>["\']) # require value to be quoted
(?P<path>{0}(?P<value>.*?)) # the url value
\2""".format(instrasite_link_regex)
hrefs = re.compile(regex, re.X)
def replacer(m):
what = m.group('what')
value = urlparse(m.group('value'))
path = value.path
origin = m.group('path')
# XXX Put this in a different location.
if what in {'filename', 'attach'}:
if path.startswith('/'):
path = path[1:]
else:
# relative to the source path of this content
path = self.get_relative_source_path(
os.path.join(self.relative_dir, path)
)
if path not in self._context['filenames']:
unquoted_path = path.replace('%20', ' ')
if unquoted_path in self._context['filenames']:
path = unquoted_path
linked_content = self._context['filenames'].get(path)
if linked_content:
if what == 'attach':
if isinstance(linked_content, Static):
linked_content.attach_to(self)
else:
logger.warning(
"%s used {attach} link syntax on a "
"non-static file. Use {filename} instead.",
self.get_relative_source_path())
origin = '/'.join((siteurl, linked_content.url))
origin = origin.replace('\\', '/') # for Windows paths.
else:
logger.warning(
"Unable to find '%s', skipping url replacement.",
value.geturl(), extra={
'limit_msg': ("Other resources were not found "
"and their urls not replaced")})
elif what == 'category':
origin = '/'.join((siteurl, Category(path, self.settings).url))
elif what == 'tag':
origin = '/'.join((siteurl, Tag(path, self.settings).url))
elif what == 'index':
origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS']))
elif what == 'author':
origin = '/'.join((siteurl, Author(path, self.settings).url))
else:
logger.warning(
"Replacement Indicator '%s' not recognized, "
"skipping replacement",
what)
# keep all other parts, such as query, fragment, etc.
parts = list(value)
parts[2] = origin
origin = urlunparse(parts)
return ''.join((m.group('markup'), m.group('quote'), origin,
m.group('quote')))
return hrefs.sub(replacer, content)
def get_siteurl(self):
return self._context.get('localsiteurl', '')
@memoized
def get_content(self, siteurl):
if hasattr(self, '_get_content'):
content = self._get_content()
else:
content = self._content
return self._update_content(content, siteurl)
@property
def content(self):
return self.get_content(self.get_siteurl())
@memoized
def get_summary(self, siteurl):
"""Returns the summary of an article.
This is based on the summary metadata if set, otherwise truncate the
content.
"""
if hasattr(self, '_summary'):
return self._update_content(self._summary, siteurl)
if self.settings['SUMMARY_MAX_LENGTH'] is None:
return self.content
return truncate_html_words(self.content,
self.settings['SUMMARY_MAX_LENGTH'])
@property
def summary(self):
return self.get_summary(self.get_siteurl())
def _get_summary(self):
"""deprecated function to access summary"""
logger.warning('_get_summary() has been deprecated since 3.6.4. '
'Use the summary decorator instead')
return self.summary
@summary.setter
def summary(self, value):
"""Dummy function"""
pass
@property
def status(self):
return self._status
@status.setter
def status(self, value):
# TODO maybe typecheck
self._status = value.lower()
@property
def url(self):
return self.get_url_setting('url')
@property
def save_as(self):
return self.get_url_setting('save_as')
def _get_template(self):
if hasattr(self, 'template') and self.template is not None:
return self.template
else:
return self.default_template
def get_relative_source_path(self, source_path=None):
"""Return the relative path (from the content path) to the given
source_path.
If no source path is specified, use the source path of this
content object.
"""
if not source_path:
source_path = self.source_path
if source_path is None:
return None
return posixize_path(
os.path.relpath(
os.path.abspath(os.path.join(
self.settings['PATH'],
source_path)),
os.path.abspath(self.settings['PATH'])
))
@property
def relative_dir(self):
return posixize_path(
os.path.dirname(
os.path.relpath(
os.path.abspath(self.source_path),
os.path.abspath(self.settings['PATH']))))
class Page(Content):
mandatory_properties = ('title',)
allowed_statuses = ('published', 'hidden')
default_status = 'published'
default_template = 'page'
class Article(Content):
mandatory_properties = ('title', 'date', 'category')
allowed_statuses = ('published', 'draft')
default_status = 'published'
default_template = 'article'
def __init__(self, *args, **kwargs):
super(Article, self).__init__(*args, **kwargs)
# handle WITH_FUTURE_DATES (designate article to draft based on date)
if not self.settings['WITH_FUTURE_DATES'] and hasattr(self, 'date'):
if self.date.tzinfo is None:
now = SafeDatetime.now()
else:
now = SafeDatetime.utcnow().replace(tzinfo=pytz.utc)
if self.date > now:
self.status = 'draft'
# if we are a draft and there is no date provided, set max datetime
if not hasattr(self, 'date') and self.status == 'draft':
self.date = SafeDatetime.max
def _expand_settings(self, key):
klass = 'article' if self.status == 'published' else 'draft'
return super(Article, self)._expand_settings(key, klass)
@python_2_unicode_compatible
class Static(Page):
def __init__(self, *args, **kwargs):
super(Static, self).__init__(*args, **kwargs)
self._output_location_referenced = False
@deprecated_attribute(old='filepath', new='source_path', since=(3, 2, 0))
def filepath():
return None
@deprecated_attribute(old='src', new='source_path', since=(3, 2, 0))
def src():
return None
@deprecated_attribute(old='dst', new='save_as', since=(3, 2, 0))
def dst():
return None
@property
def url(self):
# Note when url has been referenced, so we can avoid overriding it.
self._output_location_referenced = True
return super(Static, self).url
@property
def save_as(self):
# Note when save_as has been referenced, so we can avoid overriding it.
self._output_location_referenced = True
return super(Static, self).save_as
def attach_to(self, content):
"""Override our output directory with that of the given content object.
"""
# Determine our file's new output path relative to the linking
# document. If it currently lives beneath the linking
# document's source directory, preserve that relationship on output.
# Otherwise, make it a sibling.
linking_source_dir = os.path.dirname(content.source_path)
tail_path = os.path.relpath(self.source_path, linking_source_dir)
if tail_path.startswith(os.pardir + os.sep):
tail_path = os.path.basename(tail_path)
new_save_as = os.path.join(
os.path.dirname(content.save_as), tail_path)
# We do not build our new url by joining tail_path with the linking
# document's url, because we cannot know just by looking at the latter
# whether it points to the document itself or to its parent directory.
# (An url like 'some/content' might mean a directory named 'some'
# with a file named 'content', or it might mean a directory named
# 'some/content' with a file named 'index.html'.) Rather than trying
# to figure it out by comparing the linking document's url and save_as
# path, we simply build our new url from our new save_as path.
new_url = path_to_url(new_save_as)
def _log_reason(reason):
logger.warning(
"The {attach} link in %s cannot relocate "
"%s because %s. Falling back to "
"{filename} link behavior instead.",
content.get_relative_source_path(),
self.get_relative_source_path(), reason,
extra={'limit_msg': "More {attach} warnings silenced."})
# We never override an override, because we don't want to interfere
# with user-defined overrides that might be in EXTRA_PATH_METADATA.
if hasattr(self, 'override_save_as') or hasattr(self, 'override_url'):
if new_save_as != self.save_as or new_url != self.url:
_log_reason("its output location was already overridden")
return
# We never change an output path that has already been referenced,
# because we don't want to break links that depend on that path.
if self._output_location_referenced:
if new_save_as != self.save_as or new_url != self.url:
_log_reason("another link already referenced its location")
return
self.override_save_as = new_save_as
self.override_url = new_url