1
0
Fork 0
forked from github/pelican

Get HtmlReader to work again

Wrote unit tests and documentation, and improved the regular expression.
The HtmlReader is now enabled by default and parses metadata in HTML
files of the form:
<!-- key:value -->
This commit is contained in:
Florian Jacob 2012-09-02 10:09:08 +02:00
commit 39db9ddcfd
7 changed files with 72 additions and 15 deletions

View file

@ -171,6 +171,17 @@ Markdown posts should follow this pattern::
This is the content of my super blog post.
Your third option is to write raw HTML (by ending your file in ``.html``)::
<!-- title: My super title -->
<!-- date: 2010-12-03 10:20 -->
<!-- tags: thats, awesome -->
<!-- category: yeah -->
<p>
This is the content of my super blog post.
</p>
Note that, aside from the title, none of this metadata is mandatory: if the date
is not specified, Pelican will rely on the file's "mtime" timestamp, and the
category can be determined by the directory in which the file resides. For

View file

@ -50,9 +50,9 @@ Setting name (default value) What doe
here or a single string representing one locale.
When providing a list, all the locales will be tried
until one works.
`MARKUP` (``('rst', 'md')``) A list of available markup languages you want
`MARKUP` (``('rst', 'md', 'html')``) A list of available markup languages you want
to use. For the moment, the only available values
are `rst` and `md`.
are `rst`, `md` and `html`.
`MD_EXTENSIONS` (``['codehilite','extra']``) A list of the extensions that the Markdown processor
will use. Refer to the extensions chapter in the
Python-Markdown documentation for a complete list of

View file

@ -142,19 +142,30 @@ class MarkdownReader(Reader):
class HtmlReader(Reader):
file_extensions = ['html', 'htm']
_re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>')
# re.DOTALL and .*? (minimal match of an arbitrary number of characters)
# allow multi-line metadata to be matched correctly
_re = re.compile('<\!--([^\:]*):(.*?)-->', re.DOTALL)
def read(self, filename):
"""Parse content and metadata of (x)HTML files"""
with open(filename) as content:
metadata = {'title': 'unnamed'}
for i in self._re.findall(content):
key = i.split(':')[0][5:].strip()
value = i.split(':')[-1][:-3].strip()
name = key.lower()
metadata[name] = self.process_metadata(name, value)
"""Parse content and metadata of (x)HTML files.
return content, metadata
Matches for metadata tags in the form <!-- name:value -->
Activated when you add 'html' to your MARKUP settings variable
"""
content = open(filename)
metadata = {'title': 'unnamed'}
for comment in self._re.findall(content):
key = comment[0].strip().lower()
value = comment[1].strip()
# remove indentation from multi-line metadata
value = re.sub('[ \t]+', ' ', value)
value = re.sub(' ?\n ?', '\n', value)
metadata[key] = self.process_metadata(key, value)
return content, metadata
_EXTENSIONS = {}

View file

@ -21,7 +21,7 @@ _DEFAULT_CONFIG = {'PATH': '.',
'PAGE_EXCLUDES': (),
'THEME': DEFAULT_THEME,
'OUTPUT_PATH': 'output/',
'MARKUP': ('rst', 'md'),
'MARKUP': ('rst', 'md', 'html'),
'STATIC_PATHS': ['images', ],
'THEME_STATIC_PATHS': ['static', ],
'FEED_ATOM': 'feeds/all.atom.xml',

View file

@ -0,0 +1,13 @@
<!-- title: A great html article with metadata -->
<!-- tags: foo, bar, foobar -->
<!-- date: 2010-12-02 10:14 -->
<!-- category: yeah -->
<!-- author: Alexis Métaireau -->
<!-- summary:
Multi-line metadata should be supported
as well as <strong>inline markup</strong>.
-->
<!-- custom_field: http://notmyidea.org -->
<h1>This is an article in html with metadata</h1>
<p>It features very interesting insights.</p>

View file

@ -73,7 +73,8 @@ class TestArticlesGenerator(unittest.TestCase):
[u'This is an article with category !', 'published', 'yeah', 'article'],
[u'This is an article without category !', 'published', 'Default', 'article'],
[u'This is an article without category !', 'published', 'TestCategory', 'article'],
[u'This is a super article !', 'published', 'yeah', 'article']
[u'This is a super article !', 'published', 'yeah', 'article'],
[u'A great html article with metadata', 'published', u'yeah', 'article']
]
self.assertItemsEqual(articles_expected, articles)

View file

@ -90,3 +90,24 @@ class MdReaderTest(unittest.TestCase):
"<p>This is another markdown test file. Uses the mkd extension.</p>"
self.assertEqual(content, expected)
class HtmlReaderTest(unittest.TestCase):
    """Tests for reading metadata out of an HTML article with HtmlReader."""

    def test_article_with_metadata(self):
        """Compare each expected metadata entry with what the reader returns.

        Reads the ``article_with_html_metadata.html`` fixture and checks the
        returned metadata key by key; keys that are not listed in
        ``expected`` are not examined.
        """
        reader = readers.HtmlReader({})
        # Only the metadata half of the returned pair is checked here.
        _, metadata = reader.read(
            _filename('article_with_html_metadata.html'))
        expected = {
            'category': 'yeah',
            'author': u'Alexis Métaireau',
            'title': 'A great html article with metadata',
            # Expected multi-line value: a single newline between the two
            # lines and no leading or trailing whitespace.
            'summary': (u'Multi-line metadata should be'
                        u' supported\nas well as <strong>inline'
                        u' markup</strong>.'),
            'date': datetime.datetime(2010, 12, 2, 10, 14),
            'tags': ['foo', 'bar', 'foobar'],
            'custom_field': 'http://notmyidea.org',
        }

        for key, value in expected.items():
            # The key is passed as the failure message so a failing
            # assertion names the offending metadata entry.
            self.assertEqual(value, metadata[key], key)