diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 0952c7d9..afea8c01 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -190,6 +190,36 @@ syntax for Markdown posts should follow this pattern::
This is the content of my super blog post.
+Lastly, you can use vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican
+interprets the HTML in a very straightforward manner, reading metadata out
+of ``meta`` tags, the title out of the ``title`` tag, and the body out of the
+``body`` tag::
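+
+    <html>
+        <head>
+            <title>My super title</title>
+            <meta name="tags" contents="thats, awesome" />
+            <meta name="date" contents="2012-07-09 22:28" />
+            <meta name="category" contents="yeah" />
+            <meta name="author" contents="Alexis Métaireau" />
+            <meta name="summary" contents="Short version for index and feeds" />
+        </head>
+        <body>
+            This is the content of my super blog post.
+        </body>
+    </html>
+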
+With HTML, there is one simple exception to the standard metadata.
+``tags`` can be specified either with the ``tags`` metadata, as is standard in
+Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can
+be used interchangeably.
+
+Note that, aside from the title, none of this metadata is mandatory: if the date
+is not specified and ``DEFAULT_DATE`` is set to ``'fs'``, Pelican will rely on the
+file's "mtime" timestamp, and the category can be determined by the directory in
+which the file resides. For example, a file located at
+``python/foobar/myfoobar.rst`` will have a category of ``foobar``.
+
Note that, aside from the title, none of this metadata is mandatory: if the
date is not specified, Pelican can rely on the file's "mtime" timestamp through
the ``DEFAULT_DATE`` setting, and the category can be determined by the
diff --git a/docs/internals.rst b/docs/internals.rst
index cadd300b..704122ba 100644
--- a/docs/internals.rst
+++ b/docs/internals.rst
@@ -23,8 +23,8 @@ The logic is separated into different classes and concepts:
on. Since those operations are commonly used, the object is created once and
then passed to the generators.
-* **Readers** are used to read from various formats (AsciiDoc, Markdown and
- reStructuredText for now, but the system is extensible). Given a file, they
+* **Readers** are used to read from various formats (AsciiDoc, HTML, Markdown and
+ reStructuredText for now, but the system is extensible). Given a file, they
return metadata (author, tags, category, etc.) and content (HTML-formatted).
* **Generators** generate the different outputs. For instance, Pelican comes with
diff --git a/pelican/readers.py b/pelican/readers.py
index f53f7350..46db3c96 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -25,6 +25,12 @@ except ImportError:
asciidoc = False
import re
+import cgi
+try:
+ from html.parser import HTMLParser
+except ImportError:
+ from HTMLParser import HTMLParser
+
from pelican.contents import Category, Tag, Author
from pelican.utils import get_date, pelican_open
@@ -154,30 +160,114 @@ class MarkdownReader(Reader):
def read(self, source_path):
"""Parse content and metadata of markdown files"""
- text = pelican_open(source_path)
- md = Markdown(extensions=set(self.extensions + ['meta']))
- content = md.convert(text)
+
+ # pelican_open is used as a context manager here: the name bound by
+ # ``as`` is whatever its __enter__ returns (the decoded file text).
+ with pelican_open(source_path) as text:
+ # 'meta' is always added to the configured extensions; the set()
+ # collapses a duplicate if self.extensions already lists it.
+ md = Markdown(extensions=set(self.extensions + ['meta']))
+ content = md.convert(text)
+ # md.Meta is read only after convert() has run (presumably populated by
+ # the 'meta' extension -- confirm against the Markdown documentation).
metadata = self._parse_metadata(md.Meta)
return content, metadata
-class HtmlReader(Reader):
-    file_extensions = ['html', 'htm']
-    _re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>')
-    def read(self, source_path):
-        """Parse content and metadata of (x)HTML files"""
-        with open(source_path) as content:
-            metadata = {'title': 'unnamed'}
-            for i in self._re.findall(content):
-                key = i.split(':')[0][5:].strip()
-                value = i.split(':')[-1][:-3].strip()
-                name = key.lower()
-                metadata[name] = self.process_metadata(name, value)
-            return content, metadata
+class HTMLReader(Reader):
+    """Parses HTML files as input, looking for meta, title, and body tags.
+
+    Metadata comes from ``<meta name=... content=...>`` tags and from the
+    ``<title>`` tag inside ``<head>``; the article content is the markup
+    found between ``<body>`` and ``</body>``, rebuilt tag by tag.
+    """
+    file_extensions = ['htm', 'html']
+    enabled = True
+
+    class _HTMLParser(HTMLParser):
+        """Event-driven parser that collects ``metadata`` and ``body``.
+
+        ``_data_buffer`` accumulates text and re-serialised markup; it is
+        reset when ``<title>`` or ``<body>`` opens and copied out when the
+        matching end tag is seen.
+        """
+
+        def __init__(self, settings):
+            HTMLParser.__init__(self)
+            self.body = ''
+            self.metadata = {}
+            self.settings = settings
+
+            self._data_buffer = ''
+
+            # Position flags: "top level" means outside <head> and <body>.
+            self._in_top_level = True
+            self._in_head = False
+            self._in_title = False
+            self._in_body = False
+            self._in_tags = False
+
+        def handle_starttag(self, tag, attrs):
+            if tag == 'head' and self._in_top_level:
+                self._in_top_level = False
+                self._in_head = True
+            elif tag == 'title' and self._in_head:
+                self._in_title = True
+                self._data_buffer = ''
+            elif tag == 'body' and self._in_top_level:
+                self._in_top_level = False
+                self._in_body = True
+                self._data_buffer = ''
+            elif tag == 'meta' and self._in_head:
+                self._handle_meta_tag(attrs)
+            elif self._in_body:
+                self._data_buffer += self.build_tag(tag, attrs, False)
+
+        def handle_endtag(self, tag):
+            if tag == 'head':
+                if self._in_head:
+                    self._in_head = False
+                    self._in_top_level = True
+            elif tag == 'title' and self._in_title:
+                # Guarded by _in_title so that a </title> met inside the
+                # body (e.g. inline SVG) cannot overwrite the article title.
+                self._in_title = False
+                self.metadata['title'] = self._data_buffer
+            elif tag == 'body' and self._in_body:
+                # Guarded by _in_body so a stray </body> cannot clobber the
+                # content that was already captured.
+                self.body = self._data_buffer
+                self._in_body = False
+                self._in_top_level = True
+            elif self._in_body:
+                # Re-emit the complete closing tag, including "</".
+                self._data_buffer += '</{}>'.format(cgi.escape(tag))
+
+        def handle_startendtag(self, tag, attrs):
+            if tag == 'meta' and self._in_head:
+                self._handle_meta_tag(attrs)
+            if self._in_body:
+                self._data_buffer += self.build_tag(tag, attrs, True)
+
+        def handle_comment(self, data):
+            # The parser hands over only the text between the delimiters,
+            # so they have to be restored around it.
+            self._data_buffer += '<!--{}-->'.format(data)
+
+        def handle_data(self, data):
+            self._data_buffer += data
+
+        def handle_entityref(self, data):
+            self._data_buffer += '&{};'.format(data)
+
+        def handle_charref(self, data):
+            # data is e.g. '62' or 'x3E'; "&#" was consumed by the parser.
+            self._data_buffer += '&#{};'.format(data)
+
+        def build_tag(self, tag, attrs, close_tag):
+            """Serialise a tag and its attributes back to markup.
+
+            ``attrs`` is the list of ``(name, value)`` pairs supplied by
+            HTMLParser; ``close_tag`` selects the self-closing ``/>`` form.
+            """
+            parts = ['<{}'.format(cgi.escape(tag))]
+            for key, value in attrs:
+                if value is None:
+                    # Valueless attribute such as ``disabled``: HTMLParser
+                    # reports None, which cgi.escape cannot process.
+                    parts.append(' {}'.format(cgi.escape(key)))
+                else:
+                    # quote=True also escapes '"', so the value cannot
+                    # break out of the surrounding double quotes.
+                    parts.append(' {}="{}"'.format(
+                        cgi.escape(key), cgi.escape(value, True)))
+            parts.append(' />' if close_tag else '>')
+            return ''.join(parts)
+
+        def _handle_meta_tag(self, attrs):
+            """Record one ``<meta>`` tag as a metadata entry."""
+            name = self._attr_value(attrs, 'name')
+            if name is None:
+                # <meta charset=...> / <meta http-equiv=...> carry no
+                # name and are not article metadata.
+                return
+            name = name.lower()
+
+            # Standard HTML spells the attribute ``content``; ``contents``
+            # is still accepted so existing files keep working.
+            contents = self._attr_value(attrs, 'content')
+            if contents is None:
+                contents = self._attr_value(attrs, 'contents')
+            if contents is None:
+                contents = ''
+
+            if name == 'keywords':
+                name = 'tags'
+            self.metadata[name] = contents
+
+        @classmethod
+        def _attr_value(cls, attrs, name, default=None):
+            """Return the value of the first attribute called ``name``."""
+            return next((x[1] for x in attrs if x[0] == name), default)
+
+    def read(self, filename):
+        """Parse content and metadata of HTML files.
+
+        Returns a ``(body, metadata)`` tuple; each raw metadata string is
+        passed through ``process_metadata`` under its key.
+        """
+        with pelican_open(filename) as content:
+            parser = self._HTMLParser(self.settings)
+            parser.feed(content)
+            parser.close()
+
+        metadata = {}
+        for k in parser.metadata:
+            metadata[k] = self.process_metadata(k, parser.metadata[k])
+        return parser.body, metadata
class AsciiDocReader(Reader):
enabled = bool(asciidoc)
diff --git a/pelican/utils.py b/pelican/utils.py
index cfbc3c23..9780c119 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -185,10 +185,16 @@ def get_date(string):
raise ValueError("'%s' is not a valid date" % string)
-def pelican_open(path):
-    """Open a file and return it's content"""
-    return open(path, encoding='utf-8').read()
+class pelican_open(object):
+    """Open a file and return its content.
+
+    Used as a context manager: ``with pelican_open(path) as text:`` binds
+    ``text`` to the whole file decoded as UTF-8.
+    """
+
+    def __init__(self, filename):
+        self.filename = filename
+
+    def __enter__(self):
+        # Read inside a with-block so the handle is closed right away
+        # instead of lingering until garbage collection.
+        with open(self.filename, encoding='utf-8') as infile:
+            return infile.read()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Nothing is left to release; returning None lets any exception
+        # raised inside the with-body propagate.
+        pass
def slugify(value):
"""
diff --git a/tests/content/article_with_comments.html b/tests/content/article_with_comments.html
new file mode 100644
index 00000000..289e4a66
--- /dev/null
+++ b/tests/content/article_with_comments.html
@@ -0,0 +1,8 @@
+
+
+
+
+ Body content
+
+
+
diff --git a/tests/content/article_with_keywords.html b/tests/content/article_with_keywords.html
new file mode 100644
index 00000000..c869f514
--- /dev/null
+++ b/tests/content/article_with_keywords.html
@@ -0,0 +1,6 @@
+<html>
+    <head>
+        <title>This is a super article !</title>
+        <meta name="keywords" contents="foo, bar, foobar" />
+    </head>
+</html>
diff --git a/tests/content/article_with_metadata.html b/tests/content/article_with_metadata.html
new file mode 100644
index 00000000..b108ac8a
--- /dev/null
+++ b/tests/content/article_with_metadata.html
@@ -0,0 +1,15 @@
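+<html>
+    <head>
+        <title>This is a super article !</title>
+        <meta name="tags" contents="foo, bar, foobar" />
+        <meta name="date" contents="2010-12-02 10:14" />
+        <meta name="category" contents="yeah" />
+        <meta name="author" contents="Alexis Métaireau" />
+        <meta name="summary" contents="Summary and stuff" />
+        <meta name="custom_field" contents="http://notmyidea.org" />
+    </head>
+    <body>
+        Multi-line metadata should be supported
+        as well as inline markup.
+    </body>
+</html>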
diff --git a/tests/content/article_with_uppercase_metadata.html b/tests/content/article_with_uppercase_metadata.html
new file mode 100644
index 00000000..4fe5a9ee
--- /dev/null
+++ b/tests/content/article_with_uppercase_metadata.html
@@ -0,0 +1,6 @@
+<html>
+    <head>
+        <title>This is a super article !</title>
+        <meta name="Category" contents="Yeah" />
+    </head>
+</html>
diff --git a/tests/test_readers.py b/tests/test_readers.py
index f7cf71d9..39bc2067 100644
--- a/tests/test_readers.py
+++ b/tests/test_readers.py
@@ -260,3 +260,47 @@ class AdReaderTest(unittest.TestCase):
'version 1.0.42
\n'\
'The quick brown fox jumped over the lazy dog’s back.
\n'
self.assertEqual(content, expected)
+
+class HTMLReaderTest(unittest.TestCase):
+    """Run ``readers.HTMLReader`` over the HTML fixtures resolved by ``_path``."""
+
+    def test_article_with_comments(self):
+        """The body returned by the reader is compared verbatim."""
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_path('article_with_comments.html'))
+
+        # NOTE(review): the expected literal should mirror the fixture's
+        # <body> byte for byte -- confirm it still carries the comment markup.
+        self.assertEqual('''
+ Body content
+
+ ''', content)
+
+    def test_article_with_keywords(self):
+        """``keywords`` metadata is exposed under the ``tags`` key."""
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_path('article_with_keywords.html'))
+        expected = {
+            'tags': ['foo', 'bar', 'foobar'],
+        }
+
+        for key, value in expected.items():
+            self.assertEqual(value, metadata[key], key)
+
+    def test_article_with_metadata(self):
+        """Every expected metadata key is present with its processed value."""
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_path('article_with_metadata.html'))
+        expected = {
+            'category': 'yeah',
+            'author': 'Alexis Métaireau',
+            'title': 'This is a super article !',
+            'summary': 'Summary and stuff',
+            'date': datetime.datetime(2010, 12, 2, 10, 14),
+            'tags': ['foo', 'bar', 'foobar'],
+            'custom_field': 'http://notmyidea.org',
+        }
+
+        for key, value in expected.items():
+            self.assertEqual(value, metadata[key], key)
+
+    def test_article_metadata_key_lowercase(self):
+        """Keys of metadata should be lowercase."""
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_path('article_with_uppercase_metadata.html'))
+        self.assertIn('category', metadata, "Key should be lowercase.")
+        self.assertEqual('Yeah', metadata.get('category'), "Value keeps cases.")