1
0
Fork 0
forked from github/pelican
pelican-theme/pelican/readers.py

579 lines
19 KiB
Python
Raw Normal View History

# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import datetime
import logging
import os
import re
# docutils is an optional dependency: when it (or the pygments directives
# that depend on it) cannot be imported, ``docutils`` is set to False so
# that the RST reader can report itself as disabled.
try:
    import docutils
    import docutils.core
    import docutils.io
    from docutils.writers.html4css1 import HTMLTranslator
    # import the directives to have pygments support
    from pelican import rstdirectives  # NOQA
except ImportError:
    docutils = False
    # The translator subclasses defined further down in this module are
    # created at import time; give them a harmless base class so that a
    # missing docutils does not turn into a NameError on import.
    HTMLTranslator = object
try:
from markdown import Markdown
except ImportError:
Markdown = False # NOQA
try:
from asciidocapi import AsciiDocAPI
asciidoc = True
except ImportError:
asciidoc = False
try:
from html import escape
except ImportError:
from cgi import escape
2013-01-28 22:21:45 -05:00
try:
from html.parser import HTMLParser
except ImportError:
from HTMLParser import HTMLParser
2012-06-20 19:52:17 -04:00
from pelican import signals
from pelican.contents import Page, Category, Tag, Author
from pelican.utils import get_date, pelican_open
# Maps a lower-case metadata name to a callable taking ``(raw_value,
# settings)`` and returning the processed value.  Names without an entry are
# left untouched by ``BaseReader.process_metadata``.
METADATA_PROCESSORS = {
    # Comma-separated list.  Blank entries (e.g. produced by a trailing
    # comma) are skipped so that no tag with an empty name is created.
    'tags': lambda x, y: [Tag(tag.strip(), y)
                          for tag in x.split(',') if tag.strip()],
    'date': lambda x, y: get_date(x),
    'status': lambda x, y: x.strip(),
    'category': Category,
    'author': Author,
}

logger = logging.getLogger(__name__)
class BaseReader(object):
    """Fallback reader and common parent of the concrete readers.

    Used directly, this class handles static files, for which there is
    nothing to parse.  Subclasses override ``read`` and the following class
    attributes:

    - enabled: (boolean) whether the reader can be used; it generally
      depends on the import of some dependency.
    - file_extensions: list of the file extensions the reader processes.
    - extensions: list of extensions to use in the reader (typical use is
      Markdown), or None.

    """

    extensions = None
    file_extensions = ['static']
    enabled = True

    def __init__(self, settings):
        self.settings = settings

    def read(self, source_path):
        """No-op parser: return ``(None, {})`` whatever ``source_path`` is."""
        return None, {}

    def process_metadata(self, name, value):
        """Return ``value`` converted by the processor registered for ``name``.

        The processor is looked up in METADATA_PROCESSORS and called with
        the value and the reader settings; a value whose name has no
        processor is returned unchanged.
        """
        try:
            processor = METADATA_PROCESSORS[name]
        except KeyError:
            return value
        return processor(value, self.settings)
2011-05-10 07:55:30 +06:00
class _FieldBodyTranslator(HTMLTranslator):
    """HTML translator used to render the body of one docinfo field.

    The ``field_body`` visit/depart hooks are overridden with no-ops and
    ``astext`` returns only the accumulated body fragments joined together.
    """

    def __init__(self, document):
        # Explicit base call rather than super(): kept exactly as the
        # original wrote it.
        HTMLTranslator.__init__(self, document)
        self.compact_p = None

    def visit_field_body(self, node):
        """Emit nothing when entering the field body node."""
        pass

    def depart_field_body(self, node):
        """Emit nothing when leaving the field body node."""
        pass

    def astext(self):
        """Return the collected body fragments as a single string."""
        fragments = self.body
        return ''.join(fragments)
def render_node_to_html(document, node):
    """Walk ``node`` with a _FieldBodyTranslator and return its HTML text."""
    translator = _FieldBodyTranslator(document)
    node.walkabout(translator)
    html = translator.astext()
    return html
class PelicanHTMLTranslator(HTMLTranslator):
    """HTML translator adding ``<abbr>`` output and defaulting image alt."""

    def visit_image(self, node):
        """Render an image, forcing an ``alt`` attribute to be present."""
        # set an empty alt if alt is not specified
        # avoids that alt is taken from src
        alt_text = node.get('alt', '')
        node['alt'] = alt_text
        return HTMLTranslator.visit_image(self, node)

    def visit_abbreviation(self, node):
        """Open an ``<abbr>`` tag, using the explanation as its title."""
        if node.hasattr('explanation'):
            attrs = {'title': node['explanation']}
        else:
            attrs = {}
        opening = self.starttag(node, 'abbr', '', **attrs)
        self.body.append(opening)

    def depart_abbreviation(self, node):
        """Close the ``<abbr>`` tag."""
        self.body.append('</abbr>')
class RstReader(BaseReader):
    """Reader for reStructuredText files"""

    enabled = bool(docutils)
    file_extensions = ['rst']

    def __init__(self, *args, **kwargs):
        super(RstReader, self).__init__(*args, **kwargs)

    def _parse_metadata(self, document):
        """Return the dict containing document metadata.

        Every child of each ``docinfo`` node is turned into one entry whose
        key is lower-cased and whose value goes through process_metadata.
        """
        collected = {}
        for docinfo in document.traverse(docutils.nodes.docinfo):
            for element in docinfo.children:
                if element.tagname != 'field':
                    # standard fields (e.g. address)
                    name = element.tagname
                    value = element.astext()
                else:
                    # custom fields (e.g. summary)
                    name_elem, body_elem = element.children
                    name = name_elem.astext()
                    # the summary keeps its markup, other fields are text
                    if name == 'summary':
                        value = render_node_to_html(document, body_elem)
                    else:
                        value = body_elem.astext()

                key = name.lower()
                collected[key] = self.process_metadata(key, value)
        return collected

    def _get_publisher(self, source_path):
        """Publish ``source_path`` with docutils and return the publisher.

        The built-in docutils settings below can be overridden through the
        DOCUTILS_SETTINGS setting.
        """
        docutils_settings = {
            'initial_header_level': '2',
            'syntax_highlight': 'short',
            'input_encoding': 'utf-8',
        }
        docutils_settings.update(
            self.settings.get('DOCUTILS_SETTINGS') or {})

        publisher = docutils.core.Publisher(
            destination_class=docutils.io.StringOutput)
        publisher.set_components('standalone', 'restructuredtext', 'html')
        publisher.writer.translator_class = PelicanHTMLTranslator
        publisher.process_programmatic_settings(
            None, docutils_settings, None)
        publisher.set_source(source_path=source_path)
        publisher.publish()
        return publisher

    def read(self, source_path):
        """Parses restructured text"""
        publisher = self._get_publisher(source_path)
        parts = publisher.writer.parts

        metadata = self._parse_metadata(publisher.document)
        # the title found by docutils is only a fallback for a title field
        metadata.setdefault('title', parts.get('title'))
        return parts.get('body'), metadata
2011-05-10 07:55:30 +06:00
class MarkdownReader(BaseReader):
    """Reader for Markdown files"""

    enabled = bool(Markdown)
    file_extensions = ['md', 'markdown', 'mkd', 'mdown']

    def __init__(self, *args, **kwargs):
        super(MarkdownReader, self).__init__(*args, **kwargs)
        # work on a copy so the MD_EXTENSIONS setting itself is not modified
        configured = list(self.settings['MD_EXTENSIONS'])
        if 'meta' not in configured:
            configured.append('meta')
        self.extensions = configured

    def _parse_metadata(self, meta):
        """Return the dict containing document metadata.

        ``meta`` maps each name to a list of lines.  The summary lines are
        joined and converted with the Markdown instance created by ``read``;
        for every other name only the first line is used.
        """
        collected = {}
        for raw_name, lines in meta.items():
            key = raw_name.lower()
            if key == "summary":
                joined = "\n".join(lines)
                # reset the markdown instance to clear any state
                self._md.reset()
                raw_value = self._md.convert(joined)
            else:
                raw_value = lines[0]
            collected[key] = self.process_metadata(key, raw_value)
        return collected

    def read(self, source_path):
        """Parse content and metadata of markdown files"""
        # a fresh instance per file; _parse_metadata reuses it for summaries
        self._md = Markdown(extensions=self.extensions)
        with pelican_open(source_path) as text:
            content = self._md.convert(text)

        return content, self._parse_metadata(self._md.Meta)
class HTMLReader(BaseReader):
    """Parses HTML files as input, looking for meta, title, and body tags"""

    file_extensions = ['htm', 'html']
    enabled = True

    class _HTMLParser(HTMLParser):
        """Collect the title, the named meta tags and the body of a page.

        After feeding, ``metadata`` holds the raw (unprocessed) metadata
        strings and ``body`` the re-serialized content of the body element.
        """

        def __init__(self, settings, filename):
            try:
                # Parsers that accept this keyword may otherwise decode
                # character references before handle_data sees them, and
                # the decoded text would be written back unescaped.
                HTMLParser.__init__(self, convert_charrefs=False)
            except TypeError:
                # Parsers without the keyword never convert references.
                HTMLParser.__init__(self)
            self.body = ''
            self.metadata = {}
            self.settings = settings

            # text and markup accumulated for the element being collected
            self._data_buffer = ''

            # only used in warning messages
            self._filename = filename

            self._in_top_level = True
            self._in_head = False
            self._in_title = False
            self._in_body = False
            # set here but not read anywhere in this class
            self._in_tags = False

        def handle_starttag(self, tag, attrs):
            if tag == 'head' and self._in_top_level:
                self._in_top_level = False
                self._in_head = True
            elif tag == 'title' and self._in_head:
                self._in_title = True
                self._data_buffer = ''
            elif tag == 'body' and self._in_top_level:
                self._in_top_level = False
                self._in_body = True
                self._data_buffer = ''
            elif tag == 'meta' and self._in_head:
                self._handle_meta_tag(attrs)
            elif self._in_body:
                self._data_buffer += self.build_tag(tag, attrs, False)

        def handle_endtag(self, tag):
            if tag == 'head':
                if self._in_head:
                    self._in_head = False
                    self._in_top_level = True
            elif tag == 'title' and self._in_head:
                # Only the title of the head is the document title; a
                # </title> met inside the body (e.g. in inline SVG) falls
                # through to the generic body branch below.
                self._in_title = False
                self.metadata['title'] = self._data_buffer
            elif tag == 'body':
                self.body = self._data_buffer
                self._in_body = False
                self._in_top_level = True
            elif self._in_body:
                self._data_buffer += '</{}>'.format(escape(tag))

        def handle_startendtag(self, tag, attrs):
            if tag == 'meta' and self._in_head:
                self._handle_meta_tag(attrs)
            if self._in_body:
                self._data_buffer += self.build_tag(tag, attrs, True)

        def handle_comment(self, data):
            self._data_buffer += '<!--{}-->'.format(data)

        def handle_data(self, data):
            self._data_buffer += data

        def handle_entityref(self, data):
            self._data_buffer += '&{};'.format(data)

        def handle_charref(self, data):
            self._data_buffer += '&#{};'.format(data)

        def build_tag(self, tag, attrs, close_tag):
            """Serialize a tag and its attributes back to markup.

            ``attrs`` is a sequence of ``(name, value)`` pairs where value
            is None for attributes given without a value.  ``close_tag``
            selects the self-closing form.
            """
            result = '<{}'.format(escape(tag))
            for k, v in attrs:
                result += ' ' + escape(k)
                if v is not None:
                    # The value is wrapped in double quotes, so quotes must
                    # be escaped whichever ``escape`` was imported.
                    result += '="{}"'.format(escape(v, quote=True))
            if close_tag:
                return result + ' />'
            return result + '>'

        def _handle_meta_tag(self, attrs):
            """Store the content of a named meta tag in ``metadata``."""
            name = self._attr_value(attrs, 'name')
            if name is None:
                attr_serialized = ', '.join(
                    ['{}="{}"'.format(k, v) for k, v in attrs])
                logger.warning(
                    "Meta tag in file %s does not have a 'name' attribute, "
                    "skipping. Attributes: %s",
                    self._filename, attr_serialized)
                return
            name = name.lower()
            contents = self._attr_value(attrs, 'content', '')
            if not contents:
                # tolerate the misspelled attribute, but warn about it
                contents = self._attr_value(attrs, 'contents', '')
                if contents:
                    logger.warning(
                        "Meta tag attribute 'contents' used in file %s, "
                        "should be changed to 'content'", self._filename)

            if name == 'keywords':
                name = 'tags'
            self.metadata[name] = contents

        @classmethod
        def _attr_value(cls, attrs, name, default=None):
            """Return the value of the first attribute called ``name``."""
            return next((x[1] for x in attrs if x[0] == name), default)

    def read(self, filename):
        """Parse content and metadata of HTML files"""
        with pelican_open(filename) as content:
            parser = self._HTMLParser(self.settings, filename)
            parser.feed(content)
            parser.close()

        metadata = {}
        for k in parser.metadata:
            metadata[k] = self.process_metadata(k, parser.metadata[k])
        return parser.body, metadata
class AsciiDocReader(BaseReader):
    """Reader for AsciiDoc files"""

    enabled = bool(asciidoc)
    file_extensions = ['asc']
    default_options = ["--no-header-footer", "-a newline=\\n"]

    def read(self, source_path):
        """Parse content and metadata of asciidoc files"""
        # NOTE(review): cStringIO and ``unicode`` only exist on Python 2.
        from cStringIO import StringIO
        with pelican_open(source_path) as source:
            infile = StringIO(source)
        outfile = StringIO()
        ad = AsciiDocAPI()

        # ASCIIDOC_OPTIONS is either a list or a comma-separated string
        user_options = self.settings['ASCIIDOC_OPTIONS']
        if isinstance(user_options, (str, unicode)):
            user_options = [m.strip() for m in user_options.split(',')]
        for option in self.default_options + user_options:
            ad.options(*option.split())

        ad.execute(infile, outfile, backend="html4")
        content = outfile.getvalue()

        metadata = {}
        for raw_name, value in ad.asciidoc.document.attributes.items():
            key = raw_name.lower()
            metadata[key] = self.process_metadata(key, value)
        # the document title is exposed under the usual 'title' key as well
        if 'doctitle' in metadata:
            metadata['title'] = metadata['doctitle']
        return content, metadata
2011-02-14 19:10:01 +01:00
class Readers(object):
    """Interface for all readers.

    This class contains a mapping of file extensions / Reader classes, to know
    which Reader class must be used to read a file (based on its extension).
    This is customizable both with the 'READERS' setting, and with the
    'readers_init' signal for plugins.

    """

    # used to warn about missing dependencies only once, at the first
    # instantiation of a Readers object.
    warn_missing_deps = True

    def __init__(self, settings=None):
        """Build one reader instance per known file extension.

        ``settings`` is the settings mapping; it may be omitted, in which
        case an empty dict is used.
        """
        self.settings = settings or {}
        self.readers = {}
        self.reader_classes = {}

        for cls in [BaseReader] + BaseReader.__subclasses__():
            if not cls.enabled:
                if self.__class__.warn_missing_deps:
                    logger.debug('Missing dependencies for {}'
                                 .format(', '.join(cls.file_extensions)))
                continue

            for ext in cls.file_extensions:
                self.reader_classes[ext] = cls

        self.__class__.warn_missing_deps = False

        # .get(): the setting may be absent, notably with the default
        # empty settings used when no argument is given.
        custom_readers = self.settings.get('READERS')
        if custom_readers:
            self.reader_classes.update(custom_readers)

        signals.readers_init.send(self)

        for fmt, reader_class in self.reader_classes.items():
            # a false value disables the reader for that extension
            if not reader_class:
                continue

            self.readers[fmt] = reader_class(self.settings)

    @property
    def extensions(self):
        """The file extensions for which a reader instance exists."""
        return self.readers.keys()

    def read_file(self, base_path, path, content_class=Page, fmt=None,
                  context=None, preread_signal=None, preread_sender=None,
                  context_signal=None, context_sender=None):
        """Return a content object parsed with the given format.

        ``path`` is resolved against ``base_path``.  When ``fmt`` is not
        given, the file extension selects the reader.  Metadata is merged
        in this order, later sources overriding earlier ones: defaults,
        path metadata, metadata parsed from the path, reader metadata.

        Raises TypeError when no reader handles the format.
        """
        path = os.path.abspath(os.path.join(base_path, path))
        source_path = os.path.relpath(path, base_path)
        logger.debug('read file {} -> {}'.format(
            source_path, content_class.__name__))

        if not fmt:
            _, ext = os.path.splitext(os.path.basename(path))
            fmt = ext[1:]

        if fmt not in self.readers:
            raise TypeError(
                'Pelican does not know how to parse {}'.format(path))

        if preread_signal:
            logger.debug('signal {}.send({})'.format(
                preread_signal, preread_sender))
            preread_signal.send(preread_sender)

        reader = self.readers[fmt]

        metadata = default_metadata(
            settings=self.settings, process=reader.process_metadata)
        metadata.update(path_metadata(
            full_path=path, source_path=source_path,
            settings=self.settings))
        metadata.update(parse_path_metadata(
            source_path=source_path, settings=self.settings,
            process=reader.process_metadata))
        content, reader_metadata = reader.read(path)
        metadata.update(reader_metadata)

        if content:
            # find images with empty alt
            find_empty_alt(content, path)

        # eventually filter the content with typogrify if asked so
        if content and self.settings.get('TYPOGRIFY'):
            from typogrify.filters import typogrify
            content = typogrify(content)
            # not every source provides a title (e.g. HTML without <title>)
            if metadata.get('title'):
                metadata['title'] = typogrify(metadata['title'])

        if context_signal:
            logger.debug('signal {}.send({}, <metadata>)'.format(
                context_signal, context_sender))
            context_signal.send(context_sender, metadata=metadata)

        return content_class(content=content, metadata=metadata,
                             settings=self.settings, source_path=path,
                             context=context)
def find_empty_alt(content, path):
    """Find images with empty alt

    Create warnings for all images with empty alt (up to a certain number),
    as they are really likely to be accessibility flaws.

    ``content`` is the HTML to scan and ``path`` is only used in the
    warning messages.  Nothing is returned.
    """
    # The src value is matched with a tempered pattern that stops at the
    # closing quote; a plain ``.*`` could run past it, across other
    # attributes and tags, and report the wrong image.
    imgs = re.compile(r"""
        (?:
            # src before alt
            <img
            [^\>]*
            src=(['"])((?:(?!\1).)*)\1
            [^\>]*
            alt=(['"])\3
        )|(?:
            # alt before src
            <img
            [^\>]*
            alt=(['"])\4
            [^\>]*
            src=(['"])((?:(?!\5).)*)\5
        )
        """, re.X)
    matches = re.findall(imgs, content)

    # find a correct threshold
    nb_warnings = 10
    if len(matches) == nb_warnings + 1:
        nb_warnings += 1  # avoid bad looking case

    # print one warning per image with empty alt until threshold
    for match in matches[:nb_warnings]:
        # only one of the two src groups is filled, depending on which
        # alternative matched; the other one is an empty string
        logger.warning('Empty alt attribute for image {} in {}'.format(
            os.path.basename(match[1] + match[5]), path))

    # print one warning for the other images with empty alt
    if len(matches) > nb_warnings:
        logger.warning('{} other images with empty alt attributes'
                       .format(len(matches) - nb_warnings))
def default_metadata(settings=None, process=None):
    """Return the metadata implied by the settings alone.

    - DEFAULT_CATEGORY, when present, becomes ``category``; it is passed
      through ``process('category', value)`` when ``process`` is given.
    - DEFAULT_DATE, when set to something other than ``'fs'``, is unpacked
      into the ``datetime.datetime`` constructor and becomes ``date``.

    An empty dict is returned when ``settings`` is empty or None.
    """
    metadata = {}
    if not settings:
        return metadata

    if 'DEFAULT_CATEGORY' in settings:
        value = settings['DEFAULT_CATEGORY']
        if process:
            value = process('category', value)
        metadata['category'] = value

    # A DEFAULT_DATE of None (or any empty value) means "no default date";
    # unpacking it into the constructor would raise TypeError.
    default_date = settings.get('DEFAULT_DATE')
    if default_date and default_date != 'fs':
        metadata['date'] = datetime.datetime(*default_date)
    return metadata
def path_metadata(full_path, source_path, settings=None):
    """Return the metadata attached to a file through its location.

    - When DEFAULT_DATE is ``'fs'``, ``date`` is the modification time of
      ``full_path``.
    - Entries registered for ``source_path`` in EXTRA_PATH_METADATA are
      merged in and override the date above.

    An empty dict is returned when ``settings`` is empty or None.
    """
    metadata = {}
    if not settings:
        return metadata

    if settings.get('DEFAULT_DATE', None) == 'fs':
        # st_mtime, not st_ctime: on Unix the latter is the inode change
        # time, which also moves on chmod/rename and is not the time the
        # content was last written.
        metadata['date'] = datetime.datetime.fromtimestamp(
            os.stat(full_path).st_mtime)
    metadata.update(
        settings.get('EXTRA_PATH_METADATA', {}).get(source_path, {}))
    return metadata
def parse_path_metadata(source_path, settings=None, process=None):
    r"""Extract a metadata dictionary from a file's path

    The FILENAME_METADATA regular expression is matched against the file
    name without its extension and PATH_METADATA against the whole
    ``source_path``; with USE_FOLDER_AS_CATEGORY the name of the parent
    folder is used as category.  Named groups become metadata entries,
    keys are lower-cased, and the first value found for a key wins.  Each
    value is passed through ``process(key, value)`` when ``process`` is
    given.

    >>> import pprint
    >>> settings = {
    ...     'FILENAME_METADATA': '(?P<slug>[^.]*).*',
    ...     'PATH_METADATA':
    ...         '(?P<category>[^/]*)/(?P<date>\d{4}-\d{2}-\d{2})/.*',
    ...     }
    >>> reader = BaseReader(settings=settings)
    >>> metadata = parse_path_metadata(
    ...     source_path='my-cat/2013-01-01/my-slug.html',
    ...     settings=settings,
    ...     process=reader.process_metadata)
    >>> pprint.pprint(metadata)  # doctest: +ELLIPSIS
    {'category': <pelican.urlwrappers.Category object at ...>,
     'date': datetime.datetime(2013, 1, 1, 0, 0),
     'slug': 'my-slug'}
    """
    metadata = {}
    if not settings:
        return metadata

    dirname, basename = os.path.split(source_path)
    base, _ = os.path.splitext(basename)
    subdir = os.path.basename(dirname)

    # (regular expression, text to match) pairs, highest priority first
    checks = [(settings.get(key, None), data)
              for key, data in [('FILENAME_METADATA', base),
                                ('PATH_METADATA', source_path)]]
    if settings.get('USE_FOLDER_AS_CATEGORY', None):
        checks.insert(0, ('(?P<category>.*)', subdir))

    for regexp, data in checks:
        if not (regexp and data):
            continue
        match = re.match(regexp, data)
        if not match:
            continue
        # .items() for py3k compat.
        for k, v in match.groupdict().items():
            # Lower-case before the duplicate check so that precedence
            # does not depend on how the group name is spelled.
            k = k.lower()  # metadata must be lowercase
            # None means an optional group did not take part in the match.
            if v is None or k in metadata:
                continue
            if process:
                v = process(k, v)
            metadata[k] = v
    return metadata