class PelicanHTMLParser(HTMLParser):
    """Extract the body markup and the metadata of an HTML document.

    After the document has been fed to the parser:

    * ``body`` holds the re-serialized content of the ``<body>`` element;
    * ``metadata`` holds ``title`` (text of ``<title>`` inside ``<head>``),
      one entry per named ``<meta>`` tag found in ``<head>`` and, when a
      ``<!-- PELICAN_END_SUMMARY -->`` comment appears in the body,
      ``summary`` (the body markup collected up to that comment).
    """

    def __init__(self, settings):
        """Create a parser; `settings` is handed to every Tag it builds."""
        HTMLParser.__init__(self)
        self.body = ''
        self.metadata = {}
        self.settings = settings

        # Text and markup collected since the last <title> or <body> opened.
        self._data_buffer = ''

        self._in_top_level = True
        self._in_head = False
        self._in_title = False
        self._in_body = False
        # Not read anywhere in this class; kept so the attribute set of
        # existing instances does not change.
        self._in_tags = False

    def handle_starttag(self, tag, attrs):
        """Track document structure and copy body tags into the buffer."""
        if tag == 'head' and self._in_top_level:
            self._in_top_level = False
            self._in_head = True
        elif tag == 'title' and self._in_head:
            self._in_title = True
            self._data_buffer = ''
        elif tag == 'body' and self._in_top_level:
            self._in_top_level = False
            self._in_body = True
            self._data_buffer = ''
        elif tag == 'meta' and self._in_head:
            self._handle_meta_tag(attrs)
        elif self._in_body:
            self._data_buffer += self.build_tag(tag, attrs, False)

    def handle_endtag(self, tag):
        """Close head/title/body sections or copy a closing body tag."""
        if tag == 'head' and self._in_head:
            self._in_head = False
            self._in_top_level = True
        elif tag == 'title' and self._in_title:
            # Guarded by _in_title so that a <title> element nested in the
            # body (e.g. inside inline SVG) cannot overwrite the page title.
            self._in_title = False
            self.metadata['title'] = self._data_buffer
        elif tag == 'body' and self._in_body:
            self.body = self._data_buffer
            self._in_body = False
            self._in_top_level = True
        elif self._in_body:
            # The closing tag has to be written back out, otherwise the
            # collected body markup is left with unbalanced elements.
            self._data_buffer += '</' + cgi.escape(tag) + '>'

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags such as ``<meta ... />`` or ``<br />``."""
        if tag == 'meta' and self._in_head:
            self._handle_meta_tag(attrs)
        if self._in_body:
            self._data_buffer += self.build_tag(tag, attrs, True)

    def handle_comment(self, data):
        """Record the summary when the end-of-summary marker is reached."""
        if self._in_body and data.strip() == 'PELICAN_END_SUMMARY':
            self.metadata['summary'] = self._data_buffer

    def handle_data(self, data):
        """Append text to the buffer (reset when <title> or <body> opens)."""
        self._data_buffer += data

    def handle_entityref(self, name):
        """Keep named references such as ``&amp;`` in the collected markup."""
        self._data_buffer += '&' + name + ';'

    def handle_charref(self, name):
        """Keep numeric references such as ``&#169;`` in the markup."""
        self._data_buffer += '&#' + name + ';'

    def build_tag(self, tag, attrs, close_tag):
        """Serialize a tag and its attributes back to markup.

        `attrs` is the list of ``(name, value)`` pairs received from the
        parser; `close_tag` selects the self-closing ``<tag />`` form.
        """
        # Pieces are joined rather than passed through str.format so the
        # result keeps the string type of the parsed input.
        pieces = ['<', cgi.escape(tag)]
        for key, value in attrs:
            if value is None:
                # Valueless attribute, e.g. <input disabled>.
                pieces.extend((' ', cgi.escape(key)))
            else:
                # quote=True: the value is emitted between double quotes.
                pieces.extend((' ', cgi.escape(key), '="',
                               cgi.escape(value, quote=True), '"'))
        pieces.append(' />' if close_tag else '>')
        return ''.join(pieces)

    def _handle_meta_tag(self, attrs):
        """Store the metadata carried by one ``<meta>`` tag of the head."""
        name = self._attr_value(attrs, 'name')
        if name is None:
            # <meta charset=...>, <meta http-equiv=...>: nothing to key on.
            return

        # 'content' is the HTML attribute name; 'contents' is still read
        # as a fallback so that files written for the previous spelling
        # keep working.
        contents = (self._attr_value(attrs, 'content')
                    or self._attr_value(attrs, 'contents')
                    or '')

        if name == 'keywords':
            labels = [label.strip() for label in contents.split(',')]
            labels = [label for label in labels if label]
            if labels:
                self.metadata['tags'] = [Tag(unicode(label), self.settings)
                                         for label in labels]
        elif name == 'date':
            self.metadata['date'] = get_date(contents)
        else:
            self.metadata[name] = contents

    @classmethod
    def _attr_value(cls, attrs, name, default=None):
        """Return the value of the first attribute called `name`."""
        return next((x[1] for x in attrs if x[0] == name), default)


class HTMLReader(Reader):
    """Reader for ``.htm`` / ``.html`` source files."""

    file_extensions = ['htm', 'html']
    enabled = True

    def read(self, filename):
        """Parse content and metadata of HTML files"""
        # NOTE(review): `open` is the helper imported from pelican.utils at
        # the top of the file, not the builtin; HTMLParser.feed() expects a
        # string, so confirm what the helper yields here.
        with open(filename) as content:
            parser = PelicanHTMLParser(self.settings)
            parser.feed(content)
            parser.close()
        return parser.body, parser.metadata