# -*- coding: utf-8 -*-
"""Readers turning source files (reST, Markdown, HTML) into content + metadata.

Each reader class declares the file extensions it handles; ``read_file``
picks the reader from the extension (or an explicit format) and returns a
``(content, metadata)`` tuple.
"""
try:
    import docutils
    import docutils.core
    import docutils.io
    from docutils.writers.html4css1 import HTMLTranslator

    # import the directives to have pygments support
    from pelican import rstdirectives  # NOQA
except ImportError:
    # Kept for backward compatibility with the historical fallback name.
    core = False
    # ``RstReader.enabled`` evaluates ``bool(docutils)`` and
    # ``_FieldBodyTranslator`` subclasses ``HTMLTranslator`` at import time,
    # so both names must exist even when docutils is not installed.
    docutils = False
    HTMLTranslator = object
try:
    from markdown import Markdown
except ImportError:
    Markdown = False  # NOQA
import re
import cgi
from HTMLParser import HTMLParser

from pelican.contents import Category, Tag, Author
from pelican.utils import get_date, open


# Maps a lower-cased metadata name to a callable ``(value, settings)`` that
# converts the raw string value into its processed form.
_METADATA_PROCESSORS = {
    'tags': lambda x, y: [Tag(tag, y) for tag in unicode(x).split(',')],
    'date': lambda x, y: get_date(x),
    # ``x.strip()`` rather than ``unicode.strip(x)`` so that byte strings
    # are accepted as well as unicode strings.
    'status': lambda x, y: x.strip(),
    'category': Category,
    'author': Author,
}


class Reader(object):
    """Base class for all readers.

    Subclasses set ``file_extensions`` (the extensions they handle) and
    implement ``read(filename)`` returning ``(content, metadata)``.
    """
    enabled = True
    extensions = None

    def __init__(self, settings):
        self.settings = settings

    def process_metadata(self, name, value):
        """Convert ``value`` with the processor registered for ``name``.

        Values whose name has no entry in ``_METADATA_PROCESSORS`` are
        returned unchanged.
        """
        if name in _METADATA_PROCESSORS:
            return _METADATA_PROCESSORS[name](value, self.settings)
        return value


class _FieldBodyTranslator(HTMLTranslator):
    """HTML translator used to render a single docinfo field body."""

    def __init__(self, document):
        HTMLTranslator.__init__(self, document)
        self.compact_p = None

    def astext(self):
        """Return only the rendered body fragments, joined together."""
        return ''.join(self.body)

    def visit_field_body(self, node):
        # Emit nothing for the field_body wrapper itself; only its children.
        pass

    def depart_field_body(self, node):
        pass


def render_node_to_html(document, node):
    """Render ``node`` of ``document`` to an HTML string."""
    visitor = _FieldBodyTranslator(document)
    node.walkabout(visitor)
    return visitor.astext()


class RstReader(Reader):
    """Reader for reStructuredText files (requires docutils)."""
    enabled = bool(docutils)
    file_extensions = ['rst']

    def _parse_metadata(self, document):
        """Return the dict containing document metadata"""
        output = {}
        for docinfo in document.traverse(docutils.nodes.docinfo):
            for element in docinfo.children:
                if element.tagname == 'field':  # custom fields (e.g. summary)
                    name_elem, body_elem = element.children
                    name = name_elem.astext()
                    if name == 'summary':
                        # The summary keeps its markup, rendered as HTML.
                        value = render_node_to_html(document, body_elem)
                    else:
                        value = body_elem.astext()
                else:  # standard fields (e.g. address)
                    name = element.tagname
                    value = element.astext()
                name = name.lower()

                output[name] = self.process_metadata(name, value)
        return output

    def _get_publisher(self, filename):
        """Publish ``filename`` with docutils and return the publisher."""
        extra_params = {'initial_header_level': '2'}
        pub = docutils.core.Publisher(
            destination_class=docutils.io.StringOutput)
        pub.set_components('standalone', 'restructuredtext', 'html')
        pub.process_programmatic_settings(None, extra_params, None)
        pub.set_source(source_path=filename)
        pub.publish()
        return pub

    def read(self, filename):
        """Parses restructured text"""
        pub = self._get_publisher(filename)
        parts = pub.writer.parts
        content = parts.get('body')

        metadata = self._parse_metadata(pub.document)
        # An explicit ``title`` field wins over the document title.
        metadata.setdefault('title', parts.get('title'))

        return content, metadata


class MarkdownReader(Reader):
    """Reader for Markdown files (requires the markdown package)."""
    enabled = bool(Markdown)
    file_extensions = ['md', 'markdown', 'mkd']
    extensions = ['codehilite', 'extra']

    def read(self, filename):
        """Parse content and metadata of markdown files"""
        with open(filename) as text:
            md = Markdown(extensions=set(self.extensions + ['meta']))
            content = md.convert(text)

        metadata = {}
        for name, value in md.Meta.items():
            name = name.lower()
            # Only the first entry of each meta value list is used.
            metadata[name] = self.process_metadata(name, value[0])
        return content, metadata


class HTMLReader(Reader):
    """Parses HTML files as input, looking for meta, title, and body tags"""
    file_extensions = ['htm', 'html']
    enabled = True

    class _HTMLParser(HTMLParser):
        """Collects the title, meta tags and body markup of a document.

        Markup found inside ``<body>`` is re-serialized into ``self.body``;
        ``<title>`` and ``<meta>`` tags inside ``<head>`` populate
        ``self.metadata``.
        """

        def __init__(self, settings):
            HTMLParser.__init__(self)
            self.body = ''
            self.metadata = {}
            self.settings = settings

            self._data_buffer = ''

            self._in_top_level = True
            self._in_head = False
            self._in_title = False
            self._in_body = False
            self._in_tags = False

        def handle_starttag(self, tag, attrs):
            if tag == 'head' and self._in_top_level:
                self._in_top_level = False
                self._in_head = True
            elif tag == 'title' and self._in_head:
                self._in_title = True
                self._data_buffer = ''
            elif tag == 'body' and self._in_top_level:
                self._in_top_level = False
                self._in_body = True
                self._data_buffer = ''
            elif tag == 'meta' and self._in_head:
                self._handle_meta_tag(attrs)
            elif self._in_body:
                self._data_buffer += self.build_tag(tag, attrs, False)

        def handle_endtag(self, tag):
            if tag == 'head':
                if self._in_head:
                    self._in_head = False
                    self._in_top_level = True
            elif tag == 'title' and self._in_title:
                # Only the <title> opened inside <head> sets the metadata; a
                # </title> met elsewhere (e.g. inline SVG in the body) falls
                # through to the body branch below.
                self._in_title = False
                self.metadata['title'] = self._data_buffer
            elif tag == 'body':
                self.body = self._data_buffer
                self._in_body = False
                self._in_top_level = True
            elif self._in_body:
                # Re-emit the closing tag so the body markup stays balanced.
                self._data_buffer += '</{}>'.format(cgi.escape(tag))

        def handle_startendtag(self, tag, attrs):
            if tag == 'meta' and self._in_head:
                self._handle_meta_tag(attrs)
            if self._in_body:
                self._data_buffer += self.build_tag(tag, attrs, True)

        def handle_comment(self, data):
            if self._in_body and data.strip() == 'PELICAN_END_SUMMARY':
                # Everything buffered so far in the body is the summary.
                self.metadata['summary'] = self._data_buffer
            else:
                # Preserve other comments verbatim.
                self._data_buffer += '<!--{}-->'.format(data)

        def handle_data(self, data):
            self._data_buffer += data

        def handle_entityref(self, data):
            # Keep named entities (e.g. ``&amp;``) instead of dropping them.
            self._data_buffer += '&{};'.format(data)

        def handle_charref(self, data):
            # Keep numeric character references (e.g. ``&#169;``).
            self._data_buffer += '&#{};'.format(data)

        def build_tag(self, tag, attrs, close_tag):
            """Serialize an opening (or self-closing) tag.

            ``attrs`` is a list of ``(name, value)`` pairs; ``value`` is
            ``None`` for attributes given without a value, which are written
            as a bare attribute name.
            """
            pieces = ['<{}'.format(cgi.escape(tag))]
            for k, v in attrs:
                if v is None:
                    pieces.append(' {}'.format(cgi.escape(k)))
                else:
                    # quote=True also escapes '"', which would otherwise
                    # terminate the double-quoted attribute value early.
                    pieces.append(' {}="{}"'.format(
                        cgi.escape(k), cgi.escape(v, quote=True)))
            pieces.append(' />' if close_tag else '>')
            return ''.join(pieces)

        def _handle_meta_tag(self, attrs):
            """Record a ``<meta name=... content=...>`` tag as metadata."""
            name = self._attr_value(attrs, 'name')
            if name is None:
                # e.g. <meta charset="..."> or <meta http-equiv="...">
                return
            name = name.lower()
            # ``content`` is the standard attribute; ``contents`` is still
            # accepted for files written against the previous behaviour.
            contents = self._attr_value(attrs, 'content')
            if contents is None:
                contents = self._attr_value(attrs, 'contents') or ''

            if name == 'keywords':
                name = 'tags'
            self.metadata[name] = contents

        @classmethod
        def _attr_value(cls, attrs, name, default=None):
            """Return the value of the first attribute called ``name``."""
            return next((x[1] for x in attrs if x[0] == name), default)

    def read(self, filename):
        """Parse content and metadata of HTML files"""
        with open(filename) as content:
            parser = self._HTMLParser(self.settings)
            parser.feed(content)
            parser.close()

        metadata = {}
        for k in parser.metadata:
            metadata[k] = self.process_metadata(k, parser.metadata[k])
        return parser.body, metadata


# Maps a file extension to the reader class handling it. Only direct
# subclasses of ``Reader`` defined above are registered.
_EXTENSIONS = {}

for cls in Reader.__subclasses__():
    for ext in cls.file_extensions:
        _EXTENSIONS[ext] = cls


def read_file(filename, fmt=None, settings=None):
    """Read ``filename`` and return its ``(content, metadata)``.

    :param filename: path of the file to read.
    :param fmt: format (a key of ``_EXTENSIONS``); defaults to the text
        after the last ``.`` in ``filename``.
    :param settings: optional settings mapping. ``<FMT>_EXTENSIONS``
        overrides the reader's ``extensions``; a true ``TYPOGRIFY`` value
        runs the content and title through typogrify.
    :raises TypeError: if no reader is registered for the format.
    :raises ValueError: if the reader's dependencies are missing.
    """
    if not fmt:
        fmt = filename.split('.')[-1]

    if fmt not in _EXTENSIONS:
        raise TypeError('Pelican does not know how to parse %s' % filename)

    reader = _EXTENSIONS[fmt](settings)
    settings_key = '%s_EXTENSIONS' % fmt.upper()

    if settings and settings_key in settings:
        reader.extensions = settings[settings_key]

    if not reader.enabled:
        raise ValueError("Missing dependencies for %s" % fmt)

    content, metadata = reader.read(filename)

    # eventually filter the content with typogrify if asked so
    if settings and settings.get('TYPOGRIFY'):
        from typogrify import Typogrify
        content = Typogrify.typogrify(content)
        # A document may legitimately have no title (e.g. an HTML file
        # without <title>); only filter it when there is one.
        if metadata.get('title'):
            metadata['title'] = Typogrify.typogrify(metadata['title'])

    return content, metadata