Merge pull request #382 from mankyd/htmlparser

New, more thorough HTMLParser
This commit is contained in:
Justin Mayer 2013-02-09 16:48:50 -08:00
commit f3bc2ece86
9 changed files with 227 additions and 22 deletions

View file

@ -190,6 +190,36 @@ syntax for Markdown posts should follow this pattern::
This is the content of my super blog post. This is the content of my super blog post.
Lastly, you can use vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican
interprets the HTML in a very straightforward manner, reading metadata out
of ``meta`` tags, the title out of the ``title`` tag, and the body out of the
``body`` tag::
<html>
<head>
<title>My super title</title>
<meta name="tags" contents="thats, awesome" />
<meta name="date" contents="2012-07-09 22:28" />
<meta name="category" contents="yeah" />
<meta name="author" contents="Alexis Métaireau" />
<meta name="summary" contents="Short version for index and feeds" />
</head>
<body>
This is the content of my super blog post.
</body>
</html>
With HTML, there is one simple exception to the standard metadata.
``tags`` can be specified either with the ``tags`` metadata, as is standard in
Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can
be used interchangeably.
Note that, aside from the title, none of this metadata is mandatory: if the date
is not specified and DEFAULT_DATE is 'fs', Pelican will rely on the file's
"mtime" timestamp, and the category can be determined by the directory in which
the file resides. For example, a file located at ``python/foobar/myfoobar.rst``
will have a category of ``foobar``.
Note that, aside from the title, none of this metadata is mandatory: if the Note that, aside from the title, none of this metadata is mandatory: if the
date is not specified, Pelican can rely on the file's "mtime" timestamp through date is not specified, Pelican can rely on the file's "mtime" timestamp through
the ``DEFAULT_DATE`` setting, and the category can be determined by the the ``DEFAULT_DATE`` setting, and the category can be determined by the

View file

@ -23,7 +23,7 @@ The logic is separated into different classes and concepts:
on. Since those operations are commonly used, the object is created once and on. Since those operations are commonly used, the object is created once and
then passed to the generators. then passed to the generators.
* **Readers** are used to read from various formats (AsciiDoc, Markdown and * **Readers** are used to read from various formats (AsciiDoc, HTML, Markdown and
reStructuredText for now, but the system is extensible). Given a file, they reStructuredText for now, but the system is extensible). Given a file, they
return metadata (author, tags, category, etc.) and content (HTML-formatted). return metadata (author, tags, category, etc.) and content (HTML-formatted).

View file

@ -25,6 +25,12 @@ except ImportError:
asciidoc = False asciidoc = False
import re import re
import cgi
try:
from html.parser import HTMLParser
except ImportError:
from HTMLParser import HTMLParser
from pelican.contents import Category, Tag, Author from pelican.contents import Category, Tag, Author
from pelican.utils import get_date, pelican_open from pelican.utils import get_date, pelican_open
@ -154,30 +160,114 @@ class MarkdownReader(Reader):
def read(self, source_path): def read(self, source_path):
"""Parse content and metadata of markdown files""" """Parse content and metadata of markdown files"""
text = pelican_open(source_path)
md = Markdown(extensions=set(self.extensions + ['meta'])) with pelican_open(source_path) as text:
content = md.convert(text) md = Markdown(extensions=set(self.extensions + ['meta']))
content = md.convert(text)
metadata = self._parse_metadata(md.Meta) metadata = self._parse_metadata(md.Meta)
return content, metadata return content, metadata
class HTMLReader(Reader):
"""Parses HTML files as input, looking for meta, title, and body tags"""
file_extensions = ['htm', 'html']
enabled = True
class HtmlReader(Reader): class _HTMLParser(HTMLParser):
file_extensions = ['html', 'htm'] def __init__(self, settings):
_re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>') HTMLParser.__init__(self)
self.body = ''
self.metadata = {}
self.settings = settings
def read(self, source_path): self._data_buffer = ''
"""Parse content and metadata of (x)HTML files"""
with open(source_path) as content:
metadata = {'title': 'unnamed'}
for i in self._re.findall(content):
key = i.split(':')[0][5:].strip()
value = i.split(':')[-1][:-3].strip()
name = key.lower()
metadata[name] = self.process_metadata(name, value)
return content, metadata self._in_top_level = True
self._in_head = False
self._in_title = False
self._in_body = False
self._in_tags = False
def handle_starttag(self, tag, attrs):
    """Update the parser state for an opening tag.

    ``<head>`` and ``<body>`` are only honoured while at the top level;
    ``<title>`` and ``<meta>`` only while inside the head.  Any other tag
    seen while inside the body is re-serialized into the data buffer.
    """
    if self._in_top_level and tag == 'head':
        self._in_head = True
        self._in_top_level = False
        return

    if self._in_head and tag == 'title':
        # Start collecting the title text from a clean buffer.
        self._data_buffer = ''
        self._in_title = True
        return

    if self._in_top_level and tag == 'body':
        # Start collecting the body markup from a clean buffer.
        self._data_buffer = ''
        self._in_body = True
        self._in_top_level = False
        return

    if self._in_head and tag == 'meta':
        self._handle_meta_tag(attrs)
        return

    if self._in_body:
        self._data_buffer = self._data_buffer + self.build_tag(
            tag, attrs, False)
def handle_endtag(self, tag):
    """Update the parser state for a closing tag.

    ``</head>``, ``</title>`` and ``</body>`` only change state when the
    matching section is actually open.  Otherwise they are treated like
    any other closing tag: re-serialized into the data buffer while inside
    the body, ignored elsewhere.  This keeps, for example, a ``</title>``
    belonging to inline markup in the body from overwriting the article
    title with the body collected so far.
    """
    # cgi.escape() no longer exists on Python 3.8+; html.escape() is the
    # replacement.  Fall back to cgi only where html is unavailable.
    try:
        from html import escape
    except ImportError:  # Python 2
        from cgi import escape

    if tag == 'head' and self._in_head:
        self._in_head = False
        self._in_top_level = True
    elif tag == 'title' and self._in_title:
        self._in_title = False
        self.metadata['title'] = self._data_buffer
    elif tag == 'body' and self._in_body:
        self.body = self._data_buffer
        self._in_body = False
        self._in_top_level = True
    elif self._in_body:
        self._data_buffer += '</{}>'.format(escape(tag, quote=False))
def handle_startendtag(self, tag, attrs):
    """Handle a self-closing tag such as ``<meta ... />`` or ``<br />``.

    A ``meta`` tag inside the head is passed to the metadata handler; a
    tag seen inside the body is re-serialized in self-closing form.  The
    two checks are independent of each other.
    """
    is_head_meta = self._in_head and tag == 'meta'
    if is_head_meta:
        self._handle_meta_tag(attrs)

    if not self._in_body:
        return
    self._data_buffer = self._data_buffer + self.build_tag(tag, attrs, True)
def handle_comment(self, data):
    """Append the comment, wrapped in ``<!--``/``-->`` again, to the buffer.

    The parser state is not consulted; comments are buffered wherever
    they occur.
    """
    comment = '<!--{}-->'.format(data)
    self._data_buffer = self._data_buffer + comment
def handle_data(self, data):
    """Append raw text to the data buffer, whatever the parser state."""
    self._data_buffer = self._data_buffer + data
def handle_entityref(self, data):
    """Append a named entity reference back to the buffer as ``&name;``.

    NOTE(review): on Python 3.5+ ``HTMLParser`` defaults to
    ``convert_charrefs=True``, in which case this handler is not called
    and references reach ``handle_data`` already decoded -- confirm how
    the parser is constructed.
    """
    reference = '&{};'.format(data)
    self._data_buffer = self._data_buffer + reference
def handle_charref(self, data):
    """Append a numeric character reference back to the buffer as ``&#N;``.

    NOTE(review): on Python 3.5+ ``HTMLParser`` defaults to
    ``convert_charrefs=True``, in which case this handler is not called
    -- confirm how the parser is constructed.
    """
    reference = '&#{};'.format(data)
    self._data_buffer = self._data_buffer + reference
def build_tag(self, tag, attrs, close_tag):
    """Serialize a tag and its attributes back into HTML text.

    :param tag: tag name as reported by the parser.
    :param attrs: iterable of ``(name, value)`` pairs; ``value`` is
        ``None`` for attributes written without a value
        (``<input disabled>``).
    :param close_tag: when true, emit the self-closing form ``<tag />``.
    :return: the serialized tag.
    """
    # cgi.escape() no longer exists on Python 3.8+; html.escape() is the
    # replacement.  Fall back to cgi only where html is unavailable.
    try:
        from html import escape
    except ImportError:  # Python 2
        from cgi import escape

    parts = ['<{}'.format(escape(tag, quote=False))]
    for key, value in attrs:
        if value is None:
            # Value-less attribute: emit the bare name instead of
            # crashing while trying to escape None.
            parts.append(' {}'.format(escape(key, quote=False)))
        else:
            # The value is wrapped in double quotes, so quotes inside it
            # must be escaped as well or they would end the attribute.
            parts.append(' {}="{}"'.format(escape(key, quote=False),
                                           escape(value, quote=True)))
    parts.append(' />' if close_tag else '>')
    return ''.join(parts)
def _handle_meta_tag(self, attrs):
    """Record one ``<meta>`` tag as a metadata entry.

    The key is the lower-cased ``name`` attribute, with ``keywords``
    stored under ``tags``.  The value is read from the ``contents``
    attribute, falling back to the standard HTML ``content`` attribute
    and finally to an empty string.  Tags without a ``name`` (for example
    ``<meta charset="utf-8">`` or ``http-equiv`` tags) carry no article
    metadata and are ignored.
    """
    name = self._attr_value(attrs, 'name')
    if name is None:
        # Nothing to key the value on; previously this raised
        # AttributeError on None.lower().
        return
    name = name.lower()

    contents = self._attr_value(attrs, 'contents')
    if contents is None:
        contents = self._attr_value(attrs, 'content')
    if contents is None:
        contents = ''

    if name == 'keywords':
        name = 'tags'
    self.metadata[name] = contents
@classmethod
def _attr_value(cls, attrs, name, default=None):
    """Return the value of the first attribute called ``name``.

    ``attrs`` is a sequence of ``(name, value)`` pairs; ``default`` is
    returned when no pair matches.
    """
    for pair in attrs:
        if pair[0] == name:
            return pair[1]
    return default
def read(self, filename):
    """Parse an HTML file and return ``(body, metadata)``.

    The file is fed through the nested ``_HTMLParser``; every metadata
    entry it collected is passed through ``process_metadata`` before
    being returned alongside the parsed body.
    """
    with pelican_open(filename) as markup:
        html_parser = self._HTMLParser(self.settings)
        html_parser.feed(markup)
        html_parser.close()

    raw = html_parser.metadata
    processed = {key: self.process_metadata(key, raw[key]) for key in raw}
    return html_parser.body, processed
class AsciiDocReader(Reader): class AsciiDocReader(Reader):
enabled = bool(asciidoc) enabled = bool(asciidoc)

View file

@ -185,10 +185,16 @@ def get_date(string):
raise ValueError("'%s' is not a valid date" % string) raise ValueError("'%s' is not a valid date" % string)
def pelican_open(path): class pelican_open(object):
"""Open a file and return it's content""" """Open a file and return it's content"""
return open(path, encoding='utf-8').read() def __init__(self, filename):
self.filename = filename
def __enter__(self):
    """Return the whole content of ``self.filename`` decoded as UTF-8.

    The file is closed before returning, so no handle is left open once
    the content has been read.
    """
    with open(self.filename, encoding='utf-8') as handle:
        return handle.read()
def __exit__(self, exc_type, exc_value, traceback):
    """Do nothing on exit; returning None lets any exception propagate."""
    return None
def slugify(value): def slugify(value):
""" """

View file

@ -0,0 +1,8 @@
<html>
<head>
</head>
<body>
Body content
<!-- This comment is included (including extra whitespace) -->
</body>
</html>

View file

@ -0,0 +1,6 @@
<html>
<head>
<title>This is a super article !</title>
<meta name="keywords" contents="foo, bar, foobar" />
</head>
</html>

View file

@ -0,0 +1,15 @@
<html>
<head>
<title>This is a super article !</title>
<meta name="tags" contents="foo, bar, foobar" />
<meta name="date" contents="2010-12-02 10:14" />
<meta name="category" contents="yeah" />
<meta name="author" contents="Alexis Métaireau" />
<meta name="summary" contents="Summary and stuff" />
<meta name="custom_field" contents="http://notmyidea.org" />
</head>
<body>
Multi-line metadata should be supported
as well as <strong>inline markup</strong>.
</body>
</html>

View file

@ -0,0 +1,6 @@
<html>
<head>
<title>This is a super article !</title>
<meta name="Category" contents="Yeah" />
</head>
</html>

View file

@ -260,3 +260,47 @@ class AdReaderTest(unittest.TestCase):
'<p>version 1.0.42</p>\n'\ '<p>version 1.0.42</p>\n'\
'<p>The quick brown fox jumped over the lazy dog&#8217;s back.</p>\n' '<p>The quick brown fox jumped over the lazy dog&#8217;s back.</p>\n'
self.assertEqual(content, expected) self.assertEqual(content, expected)
class HTMLReaderTest(unittest.TestCase):
    """Exercise ``readers.HTMLReader`` against the HTML fixture files."""

    def test_article_with_comments(self):
        """Comments inside the body are kept in the returned content."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(_path('article_with_comments.html'))

        # NOTE(review): the whitespace inside this literal has to mirror
        # the fixture's <body> byte-for-byte -- confirm against
        # article_with_comments.html.
        self.assertEqual('''
Body content
<!-- This comment is included (including extra whitespace) -->
''', content)

    def test_article_with_keywords(self):
        """A ``keywords`` meta tag is exposed as ``tags``."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(_path('article_with_keywords.html'))
        expected = {
            'tags': ['foo', 'bar', 'foobar'],
        }

        for key, value in expected.items():
            self.assertEqual(value, metadata[key], key)

    def test_article_with_metadata(self):
        """Every meta tag of the fixture ends up in the metadata."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(_path('article_with_metadata.html'))
        expected = {
            'category': 'yeah',
            'author': 'Alexis Métaireau',
            'title': 'This is a super article !',
            'summary': 'Summary and stuff',
            'date': datetime.datetime(2010, 12, 2, 10, 14),
            'tags': ['foo', 'bar', 'foobar'],
            'custom_field': 'http://notmyidea.org',
        }

        for key, value in expected.items():
            self.assertEqual(value, metadata[key], key)

    def test_article_metadata_key_lowercase(self):
        """Keys of metadata should be lowercase."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(
            _path('article_with_uppercase_metadata.html'))
        self.assertIn('category', metadata, "Key should be lowercase.")
        self.assertEqual('Yeah', metadata.get('category'),
                         "Value keeps cases.")