# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function

import datetime
import logging
import os
import re

try:
    import docutils
    import docutils.core
    import docutils.io
    from docutils.writers.html4css1 import HTMLTranslator

    # import the directives to have pygments support
    from pelican import rstdirectives  # NOQA
except ImportError:
    docutils = False

try:
    from markdown import Markdown
except ImportError:
    Markdown = False  # NOQA

try:
    from asciidocapi import AsciiDocAPI
    asciidoc = True
except ImportError:
    asciidoc = False

try:
    from html import escape
except ImportError:
    from cgi import escape

try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser

from pelican import signals
from pelican.contents import Page, Category, Tag, Author
from pelican.utils import get_date, pelican_open


METADATA_PROCESSORS = {
    'tags': lambda x, y: [Tag(tag, y) for tag in x.split(',')],
    'date': lambda x, y: get_date(x),
    'modified': lambda x, y: get_date(x),
    'status': lambda x, y: x.strip(),
    'category': Category,
    'author': Author,
}

logger = logging.getLogger(__name__)


class BaseReader(object):
    """Base class to read files.

    This class is used to process static files, and it can be inherited for
    other types of file. A Reader class must have the following attributes:

    - enabled: (boolean) tells if the Reader class is enabled. It generally
      depends on the import of some dependency.
    - file_extensions: a list of file extensions that the Reader will process.
    - extensions: a list of extensions to use in the reader (typical use is
      Markdown).

    """
    enabled = True
    file_extensions = ['static']
    extensions = None

    def __init__(self, settings):
        self.settings = settings

    def process_metadata(self, name, value):
        if name in METADATA_PROCESSORS:
            return METADATA_PROCESSORS[name](value, self.settings)
        return value

    def read(self, source_path):
        "No-op parser"
        content = None
        metadata = {}
        return content, metadata


class _FieldBodyTranslator(HTMLTranslator):

    def __init__(self, document):
        HTMLTranslator.__init__(self, document)
        self.compact_p = None

    def astext(self):
        return ''.join(self.body)

    def visit_field_body(self, node):
        pass

    def depart_field_body(self, node):
        pass


def render_node_to_html(document, node):
    visitor = _FieldBodyTranslator(document)
    node.walkabout(visitor)
    return visitor.astext()


class PelicanHTMLTranslator(HTMLTranslator):

    def visit_abbreviation(self, node):
        attrs = {}
        if node.hasattr('explanation'):
            attrs['title'] = node['explanation']
        self.body.append(self.starttag(node, 'abbr', '', **attrs))

    def depart_abbreviation(self, node):
        self.body.append('</abbr>')

    def visit_image(self, node):
        # set an empty alt if alt is not specified,
        # so that the alt text is not taken from src
        node['alt'] = node.get('alt', '')
        return HTMLTranslator.visit_image(self, node)

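
# For orientation only (nothing below uses this block): RstReader takes its
# metadata from the document's docinfo / field list, so a typical source file
# starts with something like the following sketch; the actual field names a
# site uses are up to its author:
#
#     My super title
#     ##############
#
#     :date: 2013-01-01 10:20
#     :tags: pelican, rst
#     :category: misc
#     :summary: Short version for index and feeds.
#
# Standard docinfo fields (e.g. :author:) and custom fields (e.g. :summary:)
# are both handled by RstReader._parse_metadata() below.
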

class RstReader(BaseReader):
    """Reader for reStructuredText files"""

    enabled = bool(docutils)
    file_extensions = ['rst']

    def __init__(self, *args, **kwargs):
        super(RstReader, self).__init__(*args, **kwargs)

    def _parse_metadata(self, document):
        """Return the dict containing document metadata"""
        output = {}
        for docinfo in document.traverse(docutils.nodes.docinfo):
            for element in docinfo.children:
                if element.tagname == 'field':  # custom fields (e.g. summary)
                    name_elem, body_elem = element.children
                    name = name_elem.astext()
                    if name == 'summary':
                        value = render_node_to_html(document, body_elem)
                    else:
                        value = body_elem.astext()
                else:  # standard fields (e.g. address)
                    name = element.tagname
                    value = element.astext()
                name = name.lower()

                output[name] = self.process_metadata(name, value)
        return output

    def _get_publisher(self, source_path):
        extra_params = {'initial_header_level': '2',
                        'syntax_highlight': 'short',
                        'input_encoding': 'utf-8',
                        'exit_status_level': 2}
        user_params = self.settings.get('DOCUTILS_SETTINGS')
        if user_params:
            extra_params.update(user_params)

        pub = docutils.core.Publisher(
            destination_class=docutils.io.StringOutput)
        pub.set_components('standalone', 'restructuredtext', 'html')
        pub.writer.translator_class = PelicanHTMLTranslator
        pub.process_programmatic_settings(None, extra_params, None)
        pub.set_source(source_path=source_path)
        pub.publish(enable_exit_status=True)
        return pub

    def read(self, source_path):
        """Parses restructured text"""
        pub = self._get_publisher(source_path)
        parts = pub.writer.parts
        content = parts.get('body')

        metadata = self._parse_metadata(pub.document)
        metadata.setdefault('title', parts.get('title'))

        return content, metadata


class MarkdownReader(BaseReader):
    """Reader for Markdown files"""

    enabled = bool(Markdown)
    file_extensions = ['md', 'markdown', 'mkd', 'mdown']

    def __init__(self, *args, **kwargs):
        super(MarkdownReader, self).__init__(*args, **kwargs)
        self.extensions = list(self.settings['MD_EXTENSIONS'])
        if 'meta' not in self.extensions:
            self.extensions.append('meta')

    def _parse_metadata(self, meta):
        """Return the dict containing document metadata"""
        output = {}
        for name, value in meta.items():
            name = name.lower()
            if name == "summary":
                summary_values = "\n".join(value)
                # reset the markdown instance to clear any state
                self._md.reset()
                summary = self._md.convert(summary_values)
                output[name] = self.process_metadata(name, summary)
            else:
                output[name] = self.process_metadata(name, value[0])
        return output

    def read(self, source_path):
        """Parse content and metadata of markdown files"""
        self._md = Markdown(extensions=self.extensions)
        with pelican_open(source_path) as text:
            content = self._md.convert(text)

        metadata = self._parse_metadata(self._md.Meta)
        return content, metadata

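
# For reference, the 'meta' extension enabled above reads key/value headers
# from the very top of a Markdown file; the exact fields are illustrative:
#
#     Title: My super title
#     Date: 2013-01-01 10:20
#     Tags: pelican, markdown
#     Summary: Short version for index and feeds.
#
# Each value reaches _parse_metadata() as a list of lines; only 'summary' is
# run back through Markdown, the other fields keep their first line as-is.
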

class HTMLReader(BaseReader):
    """Parses HTML files as input, looking for meta, title, and body tags"""

    file_extensions = ['htm', 'html']
    enabled = True

    class _HTMLParser(HTMLParser):
        def __init__(self, settings, filename):
            HTMLParser.__init__(self)
            self.body = ''
            self.metadata = {}
            self.settings = settings

            self._data_buffer = ''

            self._filename = filename

            self._in_top_level = True
            self._in_head = False
            self._in_title = False
            self._in_body = False
            self._in_tags = False

        def handle_starttag(self, tag, attrs):
            if tag == 'head' and self._in_top_level:
                self._in_top_level = False
                self._in_head = True
            elif tag == 'title' and self._in_head:
                self._in_title = True
                self._data_buffer = ''
            elif tag == 'body' and self._in_top_level:
                self._in_top_level = False
                self._in_body = True
                self._data_buffer = ''
            elif tag == 'meta' and self._in_head:
                self._handle_meta_tag(attrs)
            elif self._in_body:
                self._data_buffer += self.build_tag(tag, attrs, False)

        def handle_endtag(self, tag):
            if tag == 'head':
                if self._in_head:
                    self._in_head = False
                    self._in_top_level = True
            elif tag == 'title':
                self._in_title = False
                self.metadata['title'] = self._data_buffer
            elif tag == 'body':
                self.body = self._data_buffer
                self._in_body = False
                self._in_top_level = True
            elif self._in_body:
                self._data_buffer += '</{}>'.format(escape(tag))

        def handle_startendtag(self, tag, attrs):
            if tag == 'meta' and self._in_head:
                self._handle_meta_tag(attrs)
            if self._in_body:
                self._data_buffer += self.build_tag(tag, attrs, True)

        def handle_comment(self, data):
            self._data_buffer += '<!--{}-->'.format(data)

        def handle_data(self, data):
            self._data_buffer += data

        def handle_entityref(self, data):
            self._data_buffer += '&{};'.format(data)

        def handle_charref(self, data):
            self._data_buffer += '&#{};'.format(data)

        def build_tag(self, tag, attrs, close_tag):
            result = '<{}'.format(escape(tag))
            for k, v in attrs:
                result += ' ' + escape(k)
                if v is not None:
                    result += '="{}"'.format(escape(v))
            if close_tag:
                return result + ' />'
            return result + '>'

        def _handle_meta_tag(self, attrs):
            name = self._attr_value(attrs, 'name')
            if name is None:
                attr_serialized = ', '.join(['{}="{}"'.format(k, v)
                                             for k, v in attrs])
                logger.warning("Meta tag in file %s does not have a 'name' "
                               "attribute, skipping. Attributes: %s",
                               self._filename, attr_serialized)
                return
            name = name.lower()
            contents = self._attr_value(attrs, 'content', '')
            if not contents:
                contents = self._attr_value(attrs, 'contents', '')
                if contents:
                    logger.warning("Meta tag attribute 'contents' used in "
                                   "file %s, should be changed to 'content'",
                                   self._filename)

            if name == 'keywords':
                name = 'tags'
            self.metadata[name] = contents

        @classmethod
        def _attr_value(cls, attrs, name, default=None):
            return next((x[1] for x in attrs if x[0] == name), default)

    def read(self, filename):
        """Parse content and metadata of HTML files"""
        with pelican_open(filename) as content:
            parser = self._HTMLParser(self.settings, filename)
            parser.feed(content)
            parser.close()

        metadata = {}
        for k in parser.metadata:
            metadata[k] = self.process_metadata(k, parser.metadata[k])
        return parser.body, metadata


class AsciiDocReader(BaseReader):
    """Reader for AsciiDoc files"""

    enabled = bool(asciidoc)
    file_extensions = ['asc', 'adoc', 'asciidoc']
    default_options = ["--no-header-footer", "-a newline=\\n"]

    def read(self, source_path):
        """Parse content and metadata of asciidoc files"""
        # cStringIO and the 'unicode' builtin below only exist on Python 2,
        # so this reader is effectively Python 2 only.
        from cStringIO import StringIO
        with pelican_open(source_path) as source:
            text = StringIO(source)
        content = StringIO()
        ad = AsciiDocAPI()

        options = self.settings['ASCIIDOC_OPTIONS']
        if isinstance(options, (str, unicode)):
            options = [m.strip() for m in options.split(',')]
        options = self.default_options + options
        for o in options:
            ad.options(*o.split())

        ad.execute(text, content, backend="html4")
        content = content.getvalue()

        metadata = {}
        for name, value in ad.asciidoc.document.attributes.items():
            name = name.lower()
            metadata[name] = self.process_metadata(name, value)
        if 'doctitle' in metadata:
            metadata['title'] = metadata['doctitle']
        return content, metadata

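
# A custom reader for another format would look roughly like the sketch
# below (class name and file type are made up). It is picked up either
# automatically, as a BaseReader subclass found via __subclasses__() in
# Readers.__init__(), or explicitly through the READERS setting:
#
#     class TxtReader(BaseReader):
#         enabled = True
#         file_extensions = ['txt']
#
#         def read(self, source_path):
#             with pelican_open(source_path) as text:
#                 metadata = {'title': os.path.splitext(
#                     os.path.basename(source_path))[0]}
#                 return text, metadata
#
#     # in the settings file:
#     # READERS = {'txt': TxtReader}
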

class Readers(object):
    """Interface for all readers.

    This class contains a mapping of file extensions / Reader classes, to know
    which Reader class must be used to read a file (based on its extension).
    This is customizable both with the 'READERS' setting, and with the
    'readers_init' signal for plugins.

    """

    # used to warn about missing dependencies only once, at the first
    # instantiation of a Readers object
    warn_missing_deps = True

    def __init__(self, settings=None):
        self.settings = settings or {}
        self.readers = {}
        self.reader_classes = {}

        for cls in [BaseReader] + BaseReader.__subclasses__():
            if not cls.enabled:
                if self.__class__.warn_missing_deps:
                    logger.debug('Missing dependencies for {}'
                                 .format(', '.join(cls.file_extensions)))
                continue

            for ext in cls.file_extensions:
                self.reader_classes[ext] = cls

        self.__class__.warn_missing_deps = False

        if self.settings['READERS']:
            self.reader_classes.update(self.settings['READERS'])

        signals.readers_init.send(self)

        for fmt, reader_class in self.reader_classes.items():
            if not reader_class:
                continue

            self.readers[fmt] = reader_class(self.settings)

    @property
    def extensions(self):
        return self.readers.keys()

    def read_file(self, base_path, path, content_class=Page, fmt=None,
                  context=None, preread_signal=None, preread_sender=None,
                  context_signal=None, context_sender=None):
        """Return a content object parsed with the given format."""

        path = os.path.abspath(os.path.join(base_path, path))
        source_path = os.path.relpath(path, base_path)
        logger.debug('read file {} -> {}'.format(
            source_path, content_class.__name__))

        if not fmt:
            _, ext = os.path.splitext(os.path.basename(path))
            fmt = ext[1:]

        if fmt not in self.readers:
            raise TypeError(
                'Pelican does not know how to parse {}'.format(path))

        if preread_signal:
            logger.debug('signal {}.send({})'.format(
                preread_signal, preread_sender))
            preread_signal.send(preread_sender)

        reader = self.readers[fmt]

        metadata = default_metadata(
            settings=self.settings, process=reader.process_metadata)
        metadata.update(path_metadata(
            full_path=path, source_path=source_path,
            settings=self.settings))
        metadata.update(parse_path_metadata(
            source_path=source_path, settings=self.settings,
            process=reader.process_metadata))
        content, reader_metadata = reader.read(path)
        metadata.update(reader_metadata)

        if content:
            # find images with empty alt
            find_empty_alt(content, path)

        # optionally filter the content through typogrify, if requested
        if content and self.settings['TYPOGRIFY']:
            from typogrify.filters import typogrify
            content = typogrify(content)
            metadata['title'] = typogrify(metadata['title'])

        if context_signal:
            logger.debug('signal {}.send({}, <metadata>)'.format(
                context_signal, context_sender))
            context_signal.send(context_sender, metadata=metadata)

        return content_class(content=content, metadata=metadata,
                             settings=self.settings, source_path=path,
                             context=context)

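
# Typical use by the generators, roughly (paths and settings values are
# illustrative, not taken from this module):
#
#     readers = Readers(settings)
#     page = readers.read_file(base_path='content',
#                              path='articles/first-post.rst',
#                              content_class=Page)
#
# read_file() layers metadata in this order, with later sources winning:
# default_metadata() -> path_metadata() -> parse_path_metadata() -> the
# metadata returned by the reader itself.
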
""" imgs = re.compile(r""" (?: # src before alt ]* src=(['"])(.*)\1 [^\>]* alt=(['"])\3 )|(?: # alt before src ]* alt=(['"])\4 [^\>]* src=(['"])(.*)\5 ) """, re.X) matches = re.findall(imgs, content) # find a correct threshold nb_warnings = 10 if len(matches) == nb_warnings + 1: nb_warnings += 1 # avoid bad looking case # print one warning per image with empty alt until threshold for match in matches[:nb_warnings]: logger.warning('Empty alt attribute for image {} in {}'.format( os.path.basename(match[1] + match[5]), path)) # print one warning for the other images with empty alt if len(matches) > nb_warnings: logger.warning('{} other images with empty alt attributes' .format(len(matches) - nb_warnings)) def default_metadata(settings=None, process=None): metadata = {} if settings: if 'DEFAULT_CATEGORY' in settings: value = settings['DEFAULT_CATEGORY'] if process: value = process('category', value) metadata['category'] = value if 'DEFAULT_DATE' in settings and settings['DEFAULT_DATE'] != 'fs': metadata['date'] = datetime.datetime(*settings['DEFAULT_DATE']) return metadata def path_metadata(full_path, source_path, settings=None): metadata = {} if settings: if settings.get('DEFAULT_DATE', None) == 'fs': metadata['date'] = datetime.datetime.fromtimestamp( os.stat(full_path).st_ctime) metadata.update(settings.get('EXTRA_PATH_METADATA', {}).get( source_path, {})) return metadata def parse_path_metadata(source_path, settings=None, process=None): """Extract a metadata dictionary from a file's path >>> import pprint >>> settings = { ... 'FILENAME_METADATA': '(?P[^.]*).*', ... 'PATH_METADATA': ... '(?P[^/]*)/(?P\d{4}-\d{2}-\d{2})/.*', ... } >>> reader = BaseReader(settings=settings) >>> metadata = parse_path_metadata( ... source_path='my-cat/2013-01-01/my-slug.html', ... settings=settings, ... process=reader.process_metadata) >>> pprint.pprint(metadata) # doctest: +ELLIPSIS {'category': , 'date': datetime.datetime(2013, 1, 1, 0, 0), 'slug': 'my-slug'} """ metadata = {} dirname, basename = os.path.split(source_path) base, ext = os.path.splitext(basename) subdir = os.path.basename(dirname) if settings: checks = [] for key, data in [('FILENAME_METADATA', base), ('PATH_METADATA', source_path)]: checks.append((settings.get(key, None), data)) if settings.get('USE_FOLDER_AS_CATEGORY', None): checks.insert(0, ('(?P.*)', subdir)) for regexp, data in checks: if regexp and data: match = re.match(regexp, data) if match: # .items() for py3k compat. for k, v in match.groupdict().items(): if k not in metadata: k = k.lower() # metadata must be lowercase if process: v = process(k, v) metadata[k] = v return metadata