mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Merge pull request #382 from mankyd/htmlparser
New, more thorough HTMLParser
This commit is contained in:
commit
f3bc2ece86
9 changed files with 227 additions and 22 deletions
|
|
@ -190,6 +190,36 @@ syntax for Markdown posts should follow this pattern::
|
||||||
|
|
||||||
This is the content of my super blog post.
|
This is the content of my super blog post.
|
||||||
|
|
||||||
|
Lastly, you can use vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican
|
||||||
|
interprets the HTML in a very straightforward manner, reading metadata out
|
||||||
|
of ``meta`` tags, the title out of the ``title`` tag, and the body out of the
|
||||||
|
``body`` tag::
|
||||||
|
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>My super title</title>
|
||||||
|
<meta name="tags" contents="thats, awesome" />
|
||||||
|
<meta name="date" contents="2012-07-09 22:28" />
|
||||||
|
<meta name="category" contents="yeah" />
|
||||||
|
<meta name="author" contents="Alexis Métaireau" />
|
||||||
|
<meta name="summary" contents="Short version for index and feeds" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
This is the content of my super blog post.
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
|
||||||
|
With HTML, there is one simple exception to the standard metadata.
|
||||||
|
``tags`` can be specified either with the ``tags`` metadata, as is standard in
|
||||||
|
Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can
|
||||||
|
be used interchangeably.
|
||||||
|
|
||||||
|
Note that, aside from the title, none of this metadata is mandatory: if the date
|
||||||
|
is not specified and DEFAULT_DATE is 'fs', Pelican will rely on the file's
|
||||||
|
"mtime" timestamp, and the category can be determined by the directory in which
|
||||||
|
the file resides. For example, a file located at ``python/foobar/myfoobar.rst``
|
||||||
|
will have a category of ``foobar``.
|
||||||
|
|
||||||
Note that, aside from the title, none of this metadata is mandatory: if the
|
Note that, aside from the title, none of this metadata is mandatory: if the
|
||||||
date is not specified, Pelican can rely on the file's "mtime" timestamp through
|
date is not specified, Pelican can rely on the file's "mtime" timestamp through
|
||||||
the ``DEFAULT_DATE`` setting, and the category can be determined by the
|
the ``DEFAULT_DATE`` setting, and the category can be determined by the
|
||||||
|
|
|
||||||
|
|
@ -23,7 +23,7 @@ The logic is separated into different classes and concepts:
|
||||||
on. Since those operations are commonly used, the object is created once and
|
on. Since those operations are commonly used, the object is created once and
|
||||||
then passed to the generators.
|
then passed to the generators.
|
||||||
|
|
||||||
* **Readers** are used to read from various formats (AsciiDoc, Markdown and
|
* **Readers** are used to read from various formats (AsciiDoc, HTML, Markdown and
|
||||||
reStructuredText for now, but the system is extensible). Given a file, they
|
reStructuredText for now, but the system is extensible). Given a file, they
|
||||||
return metadata (author, tags, category, etc.) and content (HTML-formatted).
|
return metadata (author, tags, category, etc.) and content (HTML-formatted).
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,12 @@ except ImportError:
|
||||||
asciidoc = False
|
asciidoc = False
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
import cgi
|
||||||
|
try:
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
except ImportError:
|
||||||
|
from HTMLParser import HTMLParser
|
||||||
|
|
||||||
from pelican.contents import Category, Tag, Author
|
from pelican.contents import Category, Tag, Author
|
||||||
from pelican.utils import get_date, pelican_open
|
from pelican.utils import get_date, pelican_open
|
||||||
|
|
||||||
|
|
@ -154,30 +160,114 @@ class MarkdownReader(Reader):
|
||||||
|
|
||||||
def read(self, source_path):
    """Convert a Markdown file to HTML and collect its metadata.

    Returns a ``(content, metadata)`` tuple: the output of the Markdown
    conversion and the result of ``self._parse_metadata`` applied to the
    metadata gathered by the ``meta`` extension.
    """
    with pelican_open(source_path) as raw_text:
        # The 'meta' extension is always enabled on top of the configured
        # ones; it is what populates ``converter.Meta`` during conversion.
        converter = Markdown(extensions=set(self.extensions + ['meta']))
        html = converter.convert(raw_text)

    parsed_meta = self._parse_metadata(converter.Meta)
    return html, parsed_meta
|
||||||
|
|
||||||
|
class HTMLReader(Reader):
    """Parses HTML files as input, looking for meta, title, and body tags"""
    file_extensions = ['htm', 'html']
    enabled = True

    class _HTMLParser(HTMLParser):
        """Event-driven parser that splits a document into body and metadata.

        After feeding a document, ``body`` holds the re-serialized markup
        found between ``<body>`` and ``</body>`` and ``metadata`` maps the
        lower-cased ``name`` of every ``<meta>`` tag in ``<head>`` (plus
        ``'title'``) to its raw string value.
        """

        def __init__(self, settings):
            HTMLParser.__init__(self)
            self.body = ''
            self.metadata = {}
            self.settings = settings

            # Text collected for the element currently being captured
            # (either <title> or <body>); reset when one of them opens.
            self._data_buffer = ''

            # Position flags: exactly which section of the document the
            # parser is currently inside.
            self._in_top_level = True
            self._in_head = False
            self._in_title = False
            self._in_body = False
            self._in_tags = False

        def handle_starttag(self, tag, attrs):
            """Track section changes and echo body markup into the buffer."""
            if tag == 'head' and self._in_top_level:
                self._in_top_level = False
                self._in_head = True
            elif tag == 'title' and self._in_head:
                self._in_title = True
                self._data_buffer = ''
            elif tag == 'body' and self._in_top_level:
                self._in_top_level = False
                self._in_body = True
                self._data_buffer = ''
            elif tag == 'meta' and self._in_head:
                self._handle_meta_tag(attrs)
            elif self._in_body:
                self._data_buffer += self.build_tag(tag, attrs, False)

        def handle_endtag(self, tag):
            """Close the current section or echo the closing tag into the body."""
            if tag == 'head':
                if self._in_head:
                    self._in_head = False
                    self._in_top_level = True
            elif tag == 'title' and self._in_title:
                # Only the <title> opened inside <head> sets the metadata;
                # a </title> met elsewhere (e.g. inside inline SVG in the
                # body) falls through and is echoed like any other tag.
                self._in_title = False
                self.metadata['title'] = self._data_buffer
            elif tag == 'body':
                # Ignore a stray </body> so it cannot overwrite the body
                # that was already captured.
                if self._in_body:
                    self.body = self._data_buffer
                    self._in_body = False
                    self._in_top_level = True
            elif self._in_body:
                self._data_buffer += '</{}>'.format(self._escape(tag))

        def handle_startendtag(self, tag, attrs):
            """Handle self-closing tags such as ``<meta ... />`` or ``<br />``."""
            if tag == 'meta' and self._in_head:
                self._handle_meta_tag(attrs)
            if self._in_body:
                self._data_buffer += self.build_tag(tag, attrs, True)

        def handle_comment(self, data):
            """Keep comments verbatim in the captured text."""
            self._data_buffer += '<!--{}-->'.format(data)

        def handle_data(self, data):
            """Append plain text to the captured text."""
            self._data_buffer += data

        def handle_entityref(self, data):
            """Re-emit a named entity reference (``&name;``) unchanged."""
            self._data_buffer += '&{};'.format(data)

        def handle_charref(self, data):
            """Re-emit a numeric character reference (``&#n;``) unchanged."""
            self._data_buffer += '&#{};'.format(data)

        def build_tag(self, tag, attrs, close_tag):
            """Serialize a tag and its attributes back to markup.

            ``attrs`` is a list of ``(name, value)`` pairs; a value of
            ``None`` denotes a valueless attribute and is emitted as a bare
            name. ``close_tag`` selects the self-closing ``/>`` form.
            """
            pieces = ['<{}'.format(self._escape(tag))]
            for key, value in attrs:
                if value is None:
                    pieces.append(' {}'.format(self._escape(key)))
                else:
                    pieces.append(' {}="{}"'.format(self._escape(key),
                                                    self._escape(value)))
            pieces.append(' />' if close_tag else '>')
            return ''.join(pieces)

        def _handle_meta_tag(self, attrs):
            """Record a ``<meta>`` tag as a metadata entry.

            Tags without a ``name`` (``charset``, ``http-equiv``, ...) carry
            no article metadata and are skipped. The value is read from the
            ``contents`` attribute, falling back to the standard ``content``
            attribute when ``contents`` is absent.
            """
            name = self._attr_value(attrs, 'name')
            if name is None:
                return
            name = name.lower()

            contents = self._attr_value(attrs, 'contents')
            if contents is None:
                contents = self._attr_value(attrs, 'content', '')

            # ``keywords`` is treated as an alias of ``tags``.
            if name == 'keywords':
                name = 'tags'
            self.metadata[name] = contents

        @classmethod
        def _attr_value(cls, attrs, name, default=None):
            """Return the value of the first attribute called ``name``."""
            return next((x[1] for x in attrs if x[0] == name), default)

        @staticmethod
        def _escape(text):
            """Escape ``&``, ``<``, ``>`` and ``"`` for use in markup."""
            # Imported here so the class does not depend on ``cgi.escape``,
            # which was removed from the standard library in Python 3.8.
            from xml.sax.saxutils import escape
            return escape(text, {'"': '&quot;'})

    def read(self, filename):
        """Parse content and metadata of HTML files

        Returns a ``(body, metadata)`` tuple where every metadata value has
        been passed through ``self.process_metadata``.
        """
        with pelican_open(filename) as content:
            parser = self._HTMLParser(self.settings)
            parser.feed(content)
            parser.close()

        metadata = {}
        for key, value in parser.metadata.items():
            metadata[key] = self.process_metadata(key, value)
        return parser.body, metadata
|
||||||
|
|
||||||
class AsciiDocReader(Reader):
|
class AsciiDocReader(Reader):
|
||||||
enabled = bool(asciidoc)
|
enabled = bool(asciidoc)
|
||||||
|
|
|
||||||
|
|
@ -185,10 +185,16 @@ def get_date(string):
|
||||||
raise ValueError("'%s' is not a valid date" % string)
|
raise ValueError("'%s' is not a valid date" % string)
|
||||||
|
|
||||||
|
|
||||||
class pelican_open(object):
    """Open a file and return its content

    Used as a context manager: ``with pelican_open(path) as text:`` binds
    ``text`` to the whole content of the file, decoded as UTF-8.
    """

    def __init__(self, filename):
        self.filename = filename

    def __enter__(self):
        # Read everything up front and close the handle right away, so no
        # file descriptor stays open while the caller processes the text.
        with open(self.filename, encoding='utf-8') as infile:
            return infile.read()

    def __exit__(self, exc_type, exc_value, traceback):
        # Nothing is held open at this point; returning None lets any
        # exception raised inside the ``with`` body propagate.
        return None
|
||||||
|
|
||||||
def slugify(value):
|
def slugify(value):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
8
tests/content/article_with_comments.html
Normal file
8
tests/content/article_with_comments.html
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
Body content
|
||||||
|
<!-- This comment is included (including extra whitespace) -->
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
6
tests/content/article_with_keywords.html
Normal file
6
tests/content/article_with_keywords.html
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>This is a super article !</title>
|
||||||
|
<meta name="keywords" contents="foo, bar, foobar" />
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
15
tests/content/article_with_metadata.html
Normal file
15
tests/content/article_with_metadata.html
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>This is a super article !</title>
|
||||||
|
<meta name="tags" contents="foo, bar, foobar" />
|
||||||
|
<meta name="date" contents="2010-12-02 10:14" />
|
||||||
|
<meta name="category" contents="yeah" />
|
||||||
|
<meta name="author" contents="Alexis Métaireau" />
|
||||||
|
<meta name="summary" contents="Summary and stuff" />
|
||||||
|
<meta name="custom_field" contents="http://notmyidea.org" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
Multi-line metadata should be supported
|
||||||
|
as well as <strong>inline markup</strong>.
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
6
tests/content/article_with_uppercase_metadata.html
Normal file
6
tests/content/article_with_uppercase_metadata.html
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>This is a super article !</title>
|
||||||
|
<meta name="Category" contents="Yeah" />
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
|
@ -260,3 +260,47 @@ class AdReaderTest(unittest.TestCase):
|
||||||
'<p>version 1.0.42</p>\n'\
|
'<p>version 1.0.42</p>\n'\
|
||||||
'<p>The quick brown fox jumped over the lazy dog’s back.</p>\n'
|
'<p>The quick brown fox jumped over the lazy dog’s back.</p>\n'
|
||||||
self.assertEqual(content, expected)
|
self.assertEqual(content, expected)
|
||||||
|
|
||||||
|
class HTMLReaderTest(unittest.TestCase):
    """Checks ``readers.HTMLReader`` against the HTML fixture articles."""

    def test_article_with_comments(self):
        """Comments inside the body are kept in the returned content."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(_path('article_with_comments.html'))

        # NOTE(review): the whitespace inside this literal must match the
        # fixture byte for byte - confirm against
        # tests/content/article_with_comments.html.
        self.assertEqual('''
Body content
<!-- This comment is included (including extra whitespace) -->
''', content)

    def test_article_with_keywords(self):
        """A ``keywords`` meta tag is exposed under the ``tags`` key."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(_path('article_with_keywords.html'))
        expected = {
            'tags': ['foo', 'bar', 'foobar'],
        }

        for key, value in expected.items():
            self.assertEqual(value, metadata[key], key)

    def test_article_with_metadata(self):
        """Every meta tag and the title end up in the metadata dict."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(_path('article_with_metadata.html'))
        expected = {
            'category': 'yeah',
            'author': 'Alexis Métaireau',
            'title': 'This is a super article !',
            'summary': 'Summary and stuff',
            'date': datetime.datetime(2010, 12, 2, 10, 14),
            'tags': ['foo', 'bar', 'foobar'],
            'custom_field': 'http://notmyidea.org',
        }

        for key, value in expected.items():
            self.assertEqual(value, metadata[key], key)

    def test_article_metadata_key_lowercase(self):
        """Keys of metadata should be lowercase."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(_path('article_with_uppercase_metadata.html'))
        self.assertIn('category', metadata, "Key should be lowercase.")
        self.assertEqual('Yeah', metadata.get('category'), "Value keeps cases.")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue