diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 0952c7d9..afea8c01 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -190,6 +190,36 @@ syntax for Markdown posts should follow this pattern:: This is the content of my super blog post. +Lastly, you can use vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican +interprets the HTML in a very straightforward manner, reading metadata out +of ``meta`` tags, the title out of the ``title`` tag, and the body out of the +``body`` tag:: + + + + My super title + + + + + + + + This is the content of my super blog post. + + + +With HTML, there is one simple exception to the standard metadata. +``tags`` can be specified either with the ``tags`` metadata, as is standard in +Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can +be used interchangeably. + +Note that, aside from the title, none of this metadata is mandatory: if the date +is not specified and ``DEFAULT_DATE`` is ``'fs'``, Pelican will rely on the file's +"mtime" timestamp, and the category can be determined by the directory in which +the file resides. For example, a file located at ``python/foobar/myfoobar.rst`` +will have a category of ``foobar``. + Note that, aside from the title, none of this metadata is mandatory: if the date is not specified, Pelican can rely on the file's "mtime" timestamp through the ``DEFAULT_DATE`` setting, and the category can be determined by the diff --git a/docs/internals.rst b/docs/internals.rst index cadd300b..704122ba 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -23,8 +23,8 @@ The logic is separated into different classes and concepts: on. Since those operations are commonly used, the object is created once and then passed to the generators. -* **Readers** are used to read from various formats (AsciiDoc, Markdown and - reStructuredText for now, but the system is extensible). 
class HTMLReader(Reader):
    """Read HTML files, looking for ``meta``, ``title`` and ``body`` tags.

    Metadata comes from ``<meta name="..." content="...">`` tags inside
    ``<head>``, the title from ``<title>``, and the article content from
    everything between ``<body>`` and ``</body>``.  A meta tag named
    ``keywords`` is stored under the ``tags`` key.
    """

    file_extensions = ['htm', 'html']
    enabled = True

    class _HTMLParser(HTMLParser):
        """Collect metadata and re-serialise the markup found in the body.

        After ``feed()`` and ``close()``:

        * ``metadata`` maps lower-cased meta names (and ``title``) to the
          raw, unprocessed strings found in the document;
        * ``body`` holds the markup seen between ``<body>`` and ``</body>``.
        """

        def __init__(self, settings):
            try:
                # Python 3.4+: keep character references as written.  With
                # the Python 3.5+ default (convert_charrefs=True) "&lt;" in
                # the body would be re-emitted as a raw "<".
                HTMLParser.__init__(self, convert_charrefs=False)
            except TypeError:
                # Older HTMLParser versions do not know this argument.
                HTMLParser.__init__(self)
            self.body = ''
            self.metadata = {}
            self.settings = settings

            # Text accumulated for whichever of <title>/<body> is open.
            self._data_buffer = ''

            # Position in the document; exactly one of top_level / head /
            # body is true at any time.
            self._in_top_level = True
            self._in_head = False
            self._in_title = False
            self._in_body = False
            self._in_tags = False

        @staticmethod
        def _escape(text, quote=False):
            """Escape ``&``, ``<`` and ``>`` (and quotes when *quote* is set).

            ``cgi.escape`` was removed in Python 3.8; ``html.escape`` is its
            replacement, with ``cgi`` kept as the Python 2 fallback.
            """
            try:
                from html import escape
            except ImportError:
                from cgi import escape
            return escape(text, quote)

        def handle_starttag(self, tag, attrs):
            if tag == 'head' and self._in_top_level:
                self._in_top_level = False
                self._in_head = True
            elif tag == 'title' and self._in_head:
                self._in_title = True
                self._data_buffer = ''
            elif tag == 'body' and self._in_top_level:
                self._in_top_level = False
                self._in_body = True
                self._data_buffer = ''
            elif tag == 'meta' and self._in_head:
                self._handle_meta_tag(attrs)
            elif self._in_body:
                self._data_buffer += self.build_tag(tag, attrs, False)

        def handle_endtag(self, tag):
            if tag == 'head':
                if self._in_head:
                    self._in_head = False
                    self._in_top_level = True
            elif tag == 'title' and self._in_title:
                # Only the <title> opened in <head> sets the metadata; a
                # </title> inside the body (e.g. inline SVG) falls through
                # and is re-emitted like any other closing tag.
                self._in_title = False
                self.metadata['title'] = self._data_buffer
            elif tag == 'body':
                self.body = self._data_buffer
                self._in_body = False
                self._in_top_level = True
            elif self._in_body:
                self._data_buffer += '</{}>'.format(self._escape(tag))

        def handle_startendtag(self, tag, attrs):
            if tag == 'meta' and self._in_head:
                self._handle_meta_tag(attrs)
            if self._in_body:
                self._data_buffer += self.build_tag(tag, attrs, True)

        def handle_comment(self, data):
            self._data_buffer += '<!--{}-->'.format(data)

        def handle_data(self, data):
            self._data_buffer += data

        def handle_entityref(self, data):
            self._data_buffer += '&{};'.format(data)

        def handle_charref(self, data):
            self._data_buffer += '&#{};'.format(data)

        def build_tag(self, tag, attrs, close_tag):
            """Serialise *tag* with *attrs* (a list of ``(name, value)``).

            Returns ``<tag ...>``, or ``<tag ... />`` when *close_tag* is
            true.
            """
            parts = ['<{}'.format(self._escape(tag))]
            for key, value in attrs:
                if value is None:
                    # Valueless attribute such as <input disabled>.
                    parts.append(' {}'.format(self._escape(key)))
                else:
                    # HTMLParser hands over attribute values unescaped, so
                    # quotes must be escaped again to keep the value inside
                    # its double quotes.
                    parts.append(' {}="{}"'.format(
                        self._escape(key), self._escape(value, quote=True)))
            parts.append(' />' if close_tag else '>')
            return ''.join(parts)

        def _handle_meta_tag(self, attrs):
            """Record one ``<meta>`` tag in ``self.metadata``."""
            name = self._attr_value(attrs, 'name')
            if name is None:
                # <meta charset=...>, <meta http-equiv=...> and similar
                # carry no named metadata.
                return
            name = name.lower()

            # ``content`` is the standard HTML attribute; the misspelt
            # ``contents`` is still accepted for existing files.
            contents = self._attr_value(attrs, 'content')
            if not contents:
                contents = self._attr_value(attrs, 'contents', '') or ''

            if name == 'keywords':
                name = 'tags'
            self.metadata[name] = contents

        @classmethod
        def _attr_value(cls, attrs, name, default=None):
            """Return the value of the first attribute called *name*."""
            return next((x[1] for x in attrs if x[0] == name), default)

    def read(self, filename):
        """Parse content and metadata of HTML files.

        Returns a ``(body, metadata)`` tuple.  Every raw metadata value is
        passed through ``self.process_metadata`` (provided by ``Reader``,
        which is defined outside this block).
        """
        with pelican_open(filename) as content:
            parser = self._HTMLParser(self.settings)
            parser.feed(content)
            parser.close()

        metadata = {}
        for key, value in parser.metadata.items():
            metadata[key] = self.process_metadata(key, value)
        return parser.body, metadata
+ + diff --git a/tests/content/article_with_uppercase_metadata.html b/tests/content/article_with_uppercase_metadata.html new file mode 100644 index 00000000..4fe5a9ee --- /dev/null +++ b/tests/content/article_with_uppercase_metadata.html @@ -0,0 +1,6 @@ + + + This is a super article ! + + + diff --git a/tests/test_readers.py b/tests/test_readers.py index f7cf71d9..39bc2067 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -260,3 +260,47 @@ class AdReaderTest(unittest.TestCase): '

version 1.0.42

\n'\ '

The quick brown fox jumped over the lazy dog’s back.

\n' self.assertEqual(content, expected) + +class HTMLReaderTest(unittest.TestCase): + def test_article_with_comments(self): + reader = readers.HTMLReader({}) + content, metadata = reader.read(_path('article_with_comments.html')) + + self.assertEquals(''' + Body content + + ''', content) + + def test_article_with_keywords(self): + reader = readers.HTMLReader({}) + content, metadata = reader.read(_path('article_with_keywords.html')) + expected = { + 'tags': ['foo', 'bar', 'foobar'], + } + + for key, value in expected.items(): + self.assertEquals(value, metadata[key], key) + + def test_article_with_metadata(self): + reader = readers.HTMLReader({}) + content, metadata = reader.read(_path('article_with_metadata.html')) + expected = { + 'category': 'yeah', + 'author': 'Alexis Métaireau', + 'title': 'This is a super article !', + 'summary': 'Summary and stuff', + 'date': datetime.datetime(2010, 12, 2, 10, 14), + 'tags': ['foo', 'bar', 'foobar'], + 'custom_field': 'http://notmyidea.org', + } + + for key, value in expected.items(): + self.assertEquals(value, metadata[key], key) + + + def test_article_metadata_key_lowercase(self): + """Keys of metadata should be lowercase.""" + reader = readers.HTMLReader({}) + content, metadata = reader.read(_path('article_with_uppercase_metadata.html')) + self.assertIn('category', metadata, "Key should be lowercase.") + self.assertEquals('Yeah', metadata.get('category'), "Value keeps cases.")