From 39db9ddcfde6199d7b89232d222b2b5a9c3e1e6b Mon Sep 17 00:00:00 2001
From: Florian Jacob <fjacob@lavabit.com>
Date: Sun, 2 Sep 2012 10:09:08 +0200
Subject: [PATCH] Get HtmlReader to work again; wrote unit tests and
 documentation, improved regular expression. The HtmlReader is enabled by
 default now and parses metadata in HTML files of the form: <!-- key:value -->

---
 docs/getting_started.rst                      | 11 +++++++
 docs/settings.rst                             |  4 +--
 pelican/readers.py                            | 31 +++++++++++++------
 pelican/settings.py                           |  2 +-
 tests/content/article_with_html_metadata.html | 13 ++++++++
 tests/test_generators.py                      |  3 +-
 tests/test_readers.py                         | 21 +++++++++++++
 7 files changed, 71 insertions(+), 14 deletions(-)
 create mode 100644 tests/content/article_with_html_metadata.html
diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index b7cbe951..3f622dee 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -171,6 +171,17 @@ Markdown posts should follow this pattern::
 
     This is the content of my super blog post.
 
+Your third option is to write raw HTML (in a file whose name ends in ``.html``)::
+
+    <!-- title: My super title -->
+    <!-- date: 2010-12-03 10:20 -->
+    <!-- tags: thats, awesome -->
+    <!-- category: yeah -->
+
+    <p>
+        This is the content of my super blog post.
+    </p>
+
 Note that, aside from the title, none of this metadata is mandatory: if the date
 is not specified, Pelican will rely on the file's "mtime" timestamp, and the
 category can be determined by the directory in which the file resides. For
diff --git a/docs/settings.rst b/docs/settings.rst
index ad08f020..340b2e92 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -50,9 +50,9 @@ Setting name (default value)                                            What doe
                                                                         here or a single string representing one locale.
                                                                         When providing a list, all the locales will be tried
                                                                         until one works.
-`MARKUP` (``('rst', 'md')``)                                            A list of available markup languages you want
+`MARKUP` (``('rst', 'md', 'html')``)                                    A list of available markup languages you want
                                                                         to use. For the moment, the only available values
-                                                                        are `rst` and `md`.
+                                                                        are `rst`, `md` and `html`.
 `MD_EXTENSIONS` (``['codehilite','extra']``)                            A list of the extensions that the Markdown processor
                                                                         will use. Refer to the extensions chapter in the
                                                                         Python-Markdown documentation for a complete list of
diff --git a/pelican/readers.py b/pelican/readers.py
index e3ea154d..c9ae882a 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -142,19 +142,30 @@ class MarkdownReader(Reader):
 
 class HtmlReader(Reader):
     file_extensions = ['html', 'htm']
-    _re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>')
+    # re.DOTALL and .*? (minimal match of an arbitrary number of characters)
+    # allow multi-line metadata to be matched correctly
+    _re = re.compile('<\!--([^\:]*):(.*?)-->', re.DOTALL)
 
     def read(self, filename):
-        """Parse content and metadata of (x)HTML files"""
-        with open(filename) as content:
-            metadata = {'title': 'unnamed'}
-            for i in self._re.findall(content):
-                key = i.split(':')[0][5:].strip()
-                value = i.split(':')[-1][:-3].strip()
-                name = key.lower()
-                metadata[name] = self.process_metadata(name, value)
+        """Parse content and metadata of (x)HTML files.
 
-            return content, metadata
+        Matches metadata comments of the form <!-- name:value -->.
+        Active when 'html' is in the MARKUP setting (included by default).
+
+        """
+        content = open(filename)
+        metadata = {'title': 'unnamed'}
+        for comment in self._re.findall(content):
+            key = comment[0].strip().lower()
+            value = comment[1].strip()
+
+            # remove indentation from multi-line metadata
+            value = re.sub('[ \t]+', ' ', value)
+            value = re.sub(' ?\n ?', '\n', value)
+
+            metadata[key] = self.process_metadata(key, value)
+
+        return content, metadata
 
 
 _EXTENSIONS = {}
diff --git a/pelican/settings.py b/pelican/settings.py
index 92c68ddc..82caece7 100644
--- a/pelican/settings.py
+++ b/pelican/settings.py
@@ -21,7 +21,7 @@ _DEFAULT_CONFIG = {'PATH': '.',
                    'PAGE_EXCLUDES': (),
                    'THEME': DEFAULT_THEME,
                    'OUTPUT_PATH': 'output/',
-                   'MARKUP': ('rst', 'md'),
+                   'MARKUP': ('rst', 'md', 'html'),
                    'STATIC_PATHS': ['images', ],
                    'THEME_STATIC_PATHS': ['static', ],
                    'FEED_ATOM': 'feeds/all.atom.xml',
diff --git a/tests/content/article_with_html_metadata.html b/tests/content/article_with_html_metadata.html
new file mode 100644
index 00000000..89ef4789
--- /dev/null
+++ b/tests/content/article_with_html_metadata.html
@@ -0,0 +1,13 @@
+<!-- title: A great html article with metadata -->
+<!-- tags: foo, bar, foobar -->
+<!-- date: 2010-12-02 10:14 -->
+<!-- category: yeah -->
+<!-- author: Alexis Métaireau -->
+<!-- summary:
+    Multi-line metadata should be supported
+    as well as <strong>inline markup</strong>.
+    -->
+<!-- custom_field: http://notmyidea.org -->
+
+<h1>This is an article in html with metadata</h1>
+<p>It features very interesting insights.</p>
diff --git a/tests/test_generators.py b/tests/test_generators.py
index 3a4ea1e3..3c86df92 100644
--- a/tests/test_generators.py
+++ b/tests/test_generators.py
@@ -73,7 +73,8 @@ class TestArticlesGenerator(unittest.TestCase):
             [u'This is an article with category !', 'published', 'yeah', 'article'],
             [u'This is an article without category !', 'published', 'Default', 'article'],
             [u'This is an article without category !', 'published', 'TestCategory', 'article'],
-            [u'This is a super article !', 'published', 'yeah', 'article']
+            [u'This is a super article !', 'published', 'yeah', 'article'],
+            [u'A great html article with metadata', 'published', u'yeah', 'article']
         ]
         self.assertItemsEqual(articles_expected, articles)
 
diff --git a/tests/test_readers.py b/tests/test_readers.py
index 299aa378..a9437eac 100644
--- a/tests/test_readers.py
+++ b/tests/test_readers.py
@@ -90,3 +90,24 @@ class MdReaderTest(unittest.TestCase):
                 "<p>This is another markdown test file.  Uses the mkd extension.</p>"
         
         self.assertEqual(content, expected)
+
+
+class HtmlReaderTest(unittest.TestCase):
+
+    def test_article_with_metadata(self):
+        reader = readers.HtmlReader({})
+        content, metadata = reader.read(_filename('article_with_html_metadata.html'))
+        expected = {
+            'category': 'yeah',
+            'author': u'Alexis Métaireau',
+            'title': 'A great html article with metadata',
+            'summary': u'Multi-line metadata should be'\
+                       u' supported\nas well as <strong>inline'\
+                       u' markup</strong>.',
+            'date': datetime.datetime(2010, 12, 2, 10, 14),
+            'tags': ['foo', 'bar', 'foobar'],
+            'custom_field': 'http://notmyidea.org',
+        }
+
+        for key, value in expected.items():
+            self.assertEquals(value, metadata[key], key)