diff --git a/pelican/readers.py b/pelican/readers.py index 83cb7e3b..9ce3e3c0 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -129,117 +129,101 @@ class MarkdownReader(Reader): metadata[name] = self.process_metadata(name, value[0]) return content, metadata -""" -class HtmlReader(Reader): - file_extensions = ['html', 'htm'] - _re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>') - - def read(self, filename): - with open(filename) as content: - metadata = {'title': 'unnamed'} - for i in self._re.findall(content): - key = i.split(':')[0][5:].strip() - value = i.split(':')[-1][:-3].strip() - name = key.lower() - metadata[name] = self.process_metadata(name, value) - - return content, metadata -""" - -class PelicanHTMLParser(HTMLParser): - def __init__(self, settings): - HTMLParser.__init__(self) - self.body = '' - self.metadata = {} - self.settings = settings - - self._data_buffer = '' - - self._in_top_level = True - self._in_head = False - self._in_title = False - self._in_body = False - self._in_tags = False - - def handle_starttag(self, tag, attrs): - if tag == 'head' and self._in_top_level: - self._in_top_level = False - self._in_head = True - elif tag == 'title' and self._in_head: - self._in_title = True - self._data_buffer = '' - elif tag == 'body' and self._in_top_level: - self._in_top_level = False - self._in_body = True - self._data_buffer = '' - elif tag == 'meta' and self._in_head: - self._handle_meta_tag(attrs) - - elif self._in_body: - self._data_buffer += self.build_tag(tag, attrs, False) - - def handle_endtag(self, tag): - if tag == 'head': - if self._in_head: - self._in_head = False - self._in_top_level = True - elif tag == 'title': - self._in_title = False - self.metadata['title'] = self._data_buffer - elif tag == 'body': - self.body = self._data_buffer - self._in_body = False - self._in_top_level = True - elif self._in_body: - self._data_buffer += '{}>'.format(cgi.escape(tag)) - - def handle_startendtag(self, tag, attrs): - if tag == 'meta' and self._in_head: - self._handle_meta_tag(attrs) - if self._in_body: - self._data_buffer += self.build_tag(tag, attrs, True) - - def handle_comment(self, data): - if self._in_body and data.strip() == 'PELICAN_END_SUMMARY': - self.metadata['summary'] = self._data_buffer - - def handle_data(self, data): - self._data_buffer += data - - def build_tag(self, tag, attrs, close_tag): - result = '<{}'.format(cgi.escape(tag)) - result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs)) - if close_tag: - return result + ' />' - return result + '>' - - def _handle_meta_tag(self, attrs): - name = self._attr_value(attrs, 'name') - contents = self._attr_value(attrs, 'contents', '') - if name == 'keywords': - if contents: - self.metadata['tags'] = [Tag(unicode(tag), self.settings) for tag in contents.split(',')] - elif name == 'date': - self.metadata['date'] = get_date(contents) - else: - self.metadata[name] = contents - - @classmethod - def _attr_value(cls, attrs, name, default=None): - return next((x[1] for x in attrs if x[0] == name), default) - class HTMLReader(Reader): + """Parses HTML files as input, looking for meta, title, and body tags""" file_extensions = ['htm', 'html'] enabled = True + class _HTMLParser(HTMLParser): + def __init__(self, settings): + HTMLParser.__init__(self) + self.body = '' + self.metadata = {} + self.settings = settings + + self._data_buffer = '' + + self._in_top_level = True + self._in_head = False + self._in_title = False + self._in_body = False + self._in_tags = False + + def handle_starttag(self, tag, attrs): + if tag == 'head' and self._in_top_level: + self._in_top_level = False + self._in_head = True + elif tag == 'title' and self._in_head: + self._in_title = True + self._data_buffer = '' + elif tag == 'body' and self._in_top_level: + self._in_top_level = False + self._in_body = True + self._data_buffer = '' + elif tag == 'meta' and self._in_head: + self._handle_meta_tag(attrs) + + elif self._in_body: + self._data_buffer += self.build_tag(tag, attrs, False) + + def handle_endtag(self, tag): + if tag == 'head': + if self._in_head: + self._in_head = False + self._in_top_level = True + elif tag == 'title': + self._in_title = False + self.metadata['title'] = self._data_buffer + elif tag == 'body': + self.body = self._data_buffer + self._in_body = False + self._in_top_level = True + elif self._in_body: + self._data_buffer += '{}>'.format(cgi.escape(tag)) + + def handle_startendtag(self, tag, attrs): + if tag == 'meta' and self._in_head: + self._handle_meta_tag(attrs) + if self._in_body: + self._data_buffer += self.build_tag(tag, attrs, True) + + def handle_comment(self, data): + if self._in_body and data.strip() == 'PELICAN_END_SUMMARY': + self.metadata['summary'] = self._data_buffer + + def handle_data(self, data): + self._data_buffer += data + + def build_tag(self, tag, attrs, close_tag): + result = '<{}'.format(cgi.escape(tag)) + result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs)) + if close_tag: + return result + ' />' + return result + '>' + + def _handle_meta_tag(self, attrs): + name = self._attr_value(attrs, 'name').lower() + contents = self._attr_value(attrs, 'contents', '') + + if name == 'keywords': + name = 'tags' + self.metadata[name] = contents + + @classmethod + def _attr_value(cls, attrs, name, default=None): + return next((x[1] for x in attrs if x[0] == name), default) + def read(self, filename): """Parse content and metadata of markdown files""" with open(filename) as content: - parser = PelicanHTMLParser(self.settings) + parser = self._HTMLParser(self.settings) parser.feed(content) parser.close() - return parser.body, parser.metadata + metadata = {} + for k in parser.metadata: + metadata[k] = self.process_metadata(k, parser.metadata[k]) + return parser.body, metadata _EXTENSIONS = {} diff --git a/tests/content/article_with_keywords.html b/tests/content/article_with_keywords.html new file mode 100644 index 00000000..c869f514 --- /dev/null +++ b/tests/content/article_with_keywords.html @@ -0,0 +1,6 @@ + +
+This is another markdown test file. Uses the mkd extension.
" self.assertEqual(content, expected) + +class HTMLReaderTest(unittest.TestCase): + + def test_article_with_metadata(self): + reader = readers.HTMLReader({}) + content, metadata = reader.read(_filename('article_with_metadata.html')) + expected = { + 'category': 'yeah', + 'author': u'Alexis Métaireau', + 'title': 'This is a super article !', + 'summary': u''' + Multi-line metadata should be supported + as well as inline markup. + ''', + 'date': datetime.datetime(2010, 12, 2, 10, 14), + 'tags': ['foo', 'bar', 'foobar'], + 'custom_field': 'http://notmyidea.org', + } + + for key, value in expected.items(): + self.assertEquals(value, metadata[key], key) + + def test_article_with_keywords(self): + reader = readers.HTMLReader({}) + content, metadata = reader.read(_filename('article_with_keywords.html')) + expected = { + 'tags': ['foo', 'bar', 'foobar'], + } + + for key, value in expected.items(): + self.assertEquals(value, metadata[key], key) + + def test_article_metadata_key_lowercase(self): + """Keys of metadata should be lowercase.""" + reader = readers.HTMLReader({}) + content, metadata = reader.read(_filename('article_with_uppercase_metadata.html')) + self.assertIn('category', metadata, "Key should be lowercase.") + self.assertEquals('Yeah', metadata.get('category'), "Value keeps cases.")