From 39db9ddcfde6199d7b89232d222b2b5a9c3e1e6b Mon Sep 17 00:00:00 2001 From: Florian Jacob Date: Sun, 2 Sep 2012 10:09:08 +0200 Subject: [PATCH] Get HtmlReader to work again wrote unit tests and documentation, improved regular expression. The HtmlReader is enabled by default now and parses metadata in html files of the form: <!-- key: value --> --- docs/getting_started.rst | 11 +++++++ docs/settings.rst | 4 +-- pelican/readers.py | 31 +++++++++++++------ pelican/settings.py | 2 +- tests/content/article_with_html_metadata.html | 13 ++++++++ tests/test_generators.py | 3 +- tests/test_readers.py | 21 +++++++++++++ 7 files changed, 71 insertions(+), 14 deletions(-) create mode 100644 tests/content/article_with_html_metadata.html diff --git a/docs/getting_started.rst b/docs/getting_started.rst index b7cbe951..3f622dee 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -171,6 +171,17 @@ Markdown posts should follow this pattern:: This is the content of my super blog post. +Your third option is to write raw html (by ending your file in ``.html``):: + + + + + + +

+ This is the content of my super blog post. +

+ Note that, aside from the title, none of this metadata is mandatory: if the date is not specified, Pelican will rely on the file's "mtime" timestamp, and the category can be determined by the directory in which the file resides. For diff --git a/docs/settings.rst b/docs/settings.rst index ad08f020..340b2e92 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -50,9 +50,9 @@ Setting name (default value) What doe here or a single string representing one locale. When providing a list, all the locales will be tried until one works. -`MARKUP` (``('rst', 'md')``) A list of available markup languages you want +`MARKUP` (``('rst', 'md', 'html')``) A list of available markup languages you want to use. For the moment, the only available values - are `rst` and `md`. + are `rst`, `md` and `html`. `MD_EXTENSIONS` (``['codehilite','extra']``) A list of the extensions that the Markdown processor will use. Refer to the extensions chapter in the Python-Markdown documentation for a complete list of diff --git a/pelican/readers.py b/pelican/readers.py index e3ea154d..c9ae882a 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -142,19 +142,30 @@ class MarkdownReader(Reader): class HtmlReader(Reader): file_extensions = ['html', 'htm'] - _re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>') + # re.DOTALL and .*? (minimal match of an arbitrary number of characters) + # allow multi-line metadata to be matched correctly + _re = re.compile('<\!--([^\:]*):(.*?)-->', re.DOTALL) def read(self, filename): - """Parse content and metadata of (x)HTML files""" - with open(filename) as content: - metadata = {'title': 'unnamed'} - for i in self._re.findall(content): - key = i.split(':')[0][5:].strip() - value = i.split(':')[-1][:-3].strip() - name = key.lower() - metadata[name] = self.process_metadata(name, value) + """Parse content and metadata of (x)HTML files. 
- return content, metadata + Matches for metadata tags in the form <!-- key: value --> + Activated when you add 'html' to your MARKUP settings variable + + """ + content = open(filename) + metadata = {'title': 'unnamed'} + for comment in self._re.findall(content): + key = comment[0].strip().lower() + value = comment[1].strip() + + # remove indentation from multi-line metadata + value = re.sub('[ \t]+', ' ', value) + value = re.sub(' ?\n ?', '\n', value) + + metadata[key] = self.process_metadata(key, value) + + return content, metadata _EXTENSIONS = {} diff --git a/pelican/settings.py b/pelican/settings.py index 92c68ddc..82caece7 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -21,7 +21,7 @@ _DEFAULT_CONFIG = {'PATH': '.', 'PAGE_EXCLUDES': (), 'THEME': DEFAULT_THEME, 'OUTPUT_PATH': 'output/', - 'MARKUP': ('rst', 'md'), + 'MARKUP': ('rst', 'md', 'html'), 'STATIC_PATHS': ['images', ], 'THEME_STATIC_PATHS': ['static', ], 'FEED_ATOM': 'feeds/all.atom.xml', diff --git a/tests/content/article_with_html_metadata.html b/tests/content/article_with_html_metadata.html new file mode 100644 index 00000000..89ef4789 --- /dev/null +++ b/tests/content/article_with_html_metadata.html @@ -0,0 +1,13 @@ + + + + + + + + +

This is an article in html with metadata

+

It features very interesting insights.

diff --git a/tests/test_generators.py b/tests/test_generators.py index 3a4ea1e3..3c86df92 100644 --- a/tests/test_generators.py +++ b/tests/test_generators.py @@ -73,7 +73,8 @@ class TestArticlesGenerator(unittest.TestCase): [u'This is an article with category !', 'published', 'yeah', 'article'], [u'This is an article without category !', 'published', 'Default', 'article'], [u'This is an article without category !', 'published', 'TestCategory', 'article'], - [u'This is a super article !', 'published', 'yeah', 'article'] + [u'This is a super article !', 'published', 'yeah', 'article'], + [u'A great html article with metadata', 'published', u'yeah', 'article'] ] self.assertItemsEqual(articles_expected, articles) diff --git a/tests/test_readers.py b/tests/test_readers.py index 299aa378..a9437eac 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -90,3 +90,24 @@ class MdReaderTest(unittest.TestCase): "

This is another markdown test file. Uses the mkd extension.

" self.assertEqual(content, expected) + + +class HtmlReaderTest(unittest.TestCase): + + def test_article_with_metadata(self): + reader = readers.HtmlReader({}) + content, metadata = reader.read(_filename('article_with_html_metadata.html')) + expected = { + 'category': 'yeah', + 'author': u'Alexis Métaireau', + 'title': 'A great html article with metadata', + 'summary': u'Multi-line metadata should be'\ + u' supported\nas well as inline'\ + u' markup.', + 'date': datetime.datetime(2010, 12, 2, 10, 14), + 'tags': ['foo', 'bar', 'foobar'], + 'custom_field': 'http://notmyidea.org', + } + + for key, value in expected.items(): + self.assertEquals(value, metadata[key], key)