From cc1988fbda5f191768b9d20ef0f942b572d0bb39 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Thu, 14 Jun 2012 23:08:34 -0400
Subject: [PATCH 01/19] new HTMLReader

---
 pelican/readers.py                            | 186 ++++++++----------
 tests/content/article_with_keywords.html      |   6 +
 tests/content/article_with_metadata.html      |  15 ++
 .../article_with_uppercase_metadata.html      |   6 +
 tests/test_readers.py                         |  38 ++++
 5 files changed, 150 insertions(+), 101 deletions(-)
 create mode 100644 tests/content/article_with_keywords.html
 create mode 100644 tests/content/article_with_metadata.html
 create mode 100644 tests/content/article_with_uppercase_metadata.html

diff --git a/pelican/readers.py b/pelican/readers.py
index 83cb7e3b..9ce3e3c0 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -129,117 +129,101 @@ class MarkdownReader(Reader):
             metadata[name] = self.process_metadata(name, value[0])
         return content, metadata
 
-"""
-class HtmlReader(Reader):
-    file_extensions = ['html', 'htm']
-    _re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>')
-
-    def read(self, filename):
-        with open(filename) as content:
-            metadata = {'title': 'unnamed'}
-            for i in self._re.findall(content):
-                key = i.split(':')[0][5:].strip()
-                value = i.split(':')[-1][:-3].strip()
-                name = key.lower()
-                metadata[name] = self.process_metadata(name, value)
-
-            return content, metadata
-"""
-
-class PelicanHTMLParser(HTMLParser):
-    def __init__(self, settings):
-        HTMLParser.__init__(self)
-        self.body = ''
-        self.metadata = {}
-        self.settings = settings
-
-        self._data_buffer = ''
-
-        self._in_top_level = True
-        self._in_head = False
-        self._in_title = False
-        self._in_body = False
-        self._in_tags = False
-
-    def handle_starttag(self, tag, attrs):
-        if tag == 'head' and self._in_top_level:
-            self._in_top_level = False
-            self._in_head = True
-        elif tag == 'title' and self._in_head:
-            self._in_title = True
-            self._data_buffer = ''
-        elif tag == 'body' and self._in_top_level:
-            self._in_top_level = False
-            self._in_body = True
-            self._data_buffer = ''
-        elif tag == 'meta' and self._in_head:
-            self._handle_meta_tag(attrs)
-
-        elif self._in_body:
-            self._data_buffer += self.build_tag(tag, attrs, False)
-            
-    def handle_endtag(self, tag):
-        if tag == 'head':
-            if self._in_head:
-                self._in_head = False
-                self._in_top_level = True
-        elif tag == 'title':
-            self._in_title = False
-            self.metadata['title'] = self._data_buffer
-        elif tag == 'body':
-            self.body = self._data_buffer
-            self._in_body = False
-            self._in_top_level = True
-        elif self._in_body:
-            self._data_buffer += '</{}>'.format(cgi.escape(tag))
-
-    def handle_startendtag(self, tag, attrs):
-        if tag == 'meta' and self._in_head:
-            self._handle_meta_tag(attrs)
-        if self._in_body:
-            self._data_buffer += self.build_tag(tag, attrs, True)
-
-    def handle_comment(self, data):
-        if self._in_body and data.strip() == 'PELICAN_END_SUMMARY':
-            self.metadata['summary'] = self._data_buffer
-
-    def handle_data(self, data):
-        self._data_buffer += data
-
-    def build_tag(self, tag, attrs, close_tag):
-        result = '<{}'.format(cgi.escape(tag))
-        result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs))
-        if close_tag:
-            return result + ' />'
-        return result + '>'
-
-    def _handle_meta_tag(self, attrs):
-        name = self._attr_value(attrs, 'name')
-        contents = self._attr_value(attrs, 'contents', '')
-        if name == 'keywords':
-            if contents:
-                self.metadata['tags'] = [Tag(unicode(tag), self.settings) for tag in contents.split(',')]
-        elif name == 'date':
-            self.metadata['date'] = get_date(contents)
-        else:
-            self.metadata[name] = contents
-
-    @classmethod
-    def _attr_value(cls, attrs, name, default=None):
-        return next((x[1] for x in attrs if x[0] == name), default)
-
 class HTMLReader(Reader):
+    """Parses HTML files as input, looking for meta, title, and body tags"""
     file_extensions = ['htm', 'html']
     enabled = True
 
+    class _HTMLParser(HTMLParser):
+        def __init__(self, settings):
+            HTMLParser.__init__(self)
+            self.body = ''
+            self.metadata = {}
+            self.settings = settings
+
+            self._data_buffer = ''
+
+            self._in_top_level = True
+            self._in_head = False
+            self._in_title = False
+            self._in_body = False
+            self._in_tags = False
+
+        def handle_starttag(self, tag, attrs):
+            if tag == 'head' and self._in_top_level:
+                self._in_top_level = False
+                self._in_head = True
+            elif tag == 'title' and self._in_head:
+                self._in_title = True
+                self._data_buffer = ''
+            elif tag == 'body' and self._in_top_level:
+                self._in_top_level = False
+                self._in_body = True
+                self._data_buffer = ''
+            elif tag == 'meta' and self._in_head:
+                self._handle_meta_tag(attrs)
+
+            elif self._in_body:
+                self._data_buffer += self.build_tag(tag, attrs, False)
+
+        def handle_endtag(self, tag):
+            if tag == 'head':
+                if self._in_head:
+                    self._in_head = False
+                    self._in_top_level = True
+            elif tag == 'title':
+                self._in_title = False
+                self.metadata['title'] = self._data_buffer
+            elif tag == 'body':
+                self.body = self._data_buffer
+                self._in_body = False
+                self._in_top_level = True
+            elif self._in_body:
+                self._data_buffer += '</{}>'.format(cgi.escape(tag))
+
+        def handle_startendtag(self, tag, attrs):
+            if tag == 'meta' and self._in_head:
+                self._handle_meta_tag(attrs)
+            if self._in_body:
+                self._data_buffer += self.build_tag(tag, attrs, True)
+
+        def handle_comment(self, data):
+            if self._in_body and data.strip() == 'PELICAN_END_SUMMARY':
+                self.metadata['summary'] = self._data_buffer
+
+        def handle_data(self, data):
+            self._data_buffer += data
+
+        def build_tag(self, tag, attrs, close_tag):
+            result = '<{}'.format(cgi.escape(tag))
+            result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs))
+            if close_tag:
+                return result + ' />'
+            return result + '>'
+
+        def _handle_meta_tag(self, attrs):
+            name = self._attr_value(attrs, 'name').lower()
+            contents = self._attr_value(attrs, 'contents', '')
+
+            if name == 'keywords':
+                name = 'tags'
+            self.metadata[name] = contents
+
+        @classmethod
+        def _attr_value(cls, attrs, name, default=None):
+            return next((x[1] for x in attrs if x[0] == name), default)
+
     def read(self, filename):
         """Parse content and metadata of markdown files"""
         with open(filename) as content:
-            parser = PelicanHTMLParser(self.settings)
+            parser = self._HTMLParser(self.settings)
             parser.feed(content)
             parser.close()
-        return parser.body, parser.metadata
 
+        metadata = {}
+        for k in parser.metadata:
+            metadata[k] = self.process_metadata(k, parser.metadata[k])
+        return parser.body, metadata
 
 _EXTENSIONS = {}
 
diff --git a/tests/content/article_with_keywords.html b/tests/content/article_with_keywords.html
new file mode 100644
index 00000000..c869f514
--- /dev/null
+++ b/tests/content/article_with_keywords.html
@@ -0,0 +1,6 @@
+<html>
+    <head>
+        <title>This is a super article !</title>
+        <meta name="keywords" contents="foo, bar, foobar" />
+    </head>
+</html>
diff --git a/tests/content/article_with_metadata.html b/tests/content/article_with_metadata.html
new file mode 100644
index 00000000..2bd77241
--- /dev/null
+++ b/tests/content/article_with_metadata.html
@@ -0,0 +1,15 @@
+<html>
+    <head>
+        <title>This is a super article !</title>
+        <meta name="tags" contents="foo, bar, foobar" />
+        <meta name="date" contents="2010-12-02 10:14" />
+        <meta name="category" contents="yeah" />
+        <meta name="author" contents="Alexis Métaireau" />
+        <meta name="custom_field" contents="http://notmyidea.org" />
+    </head>
+    <body>
+        Multi-line metadata should be supported
+        as well as <strong>inline markup</strong>.
+        <!-- PELICAN_END_SUMMARY -->
+    </body>
+</html>
diff --git a/tests/content/article_with_uppercase_metadata.html b/tests/content/article_with_uppercase_metadata.html
new file mode 100644
index 00000000..4fe5a9ee
--- /dev/null
+++ b/tests/content/article_with_uppercase_metadata.html
@@ -0,0 +1,6 @@
+<html>
+    <head>
+        <title>This is a super article !</title>
+        <meta name="Category" contents="Yeah" />
+    </head>
+</html>
diff --git a/tests/test_readers.py b/tests/test_readers.py
index a921cfc2..52887068 100644
--- a/tests/test_readers.py
+++ b/tests/test_readers.py
@@ -86,3 +86,41 @@ class MdReaderTest(unittest.TestCase):
                 "<p>This is another markdown test file.  Uses the mkd extension.</p>"
         
         self.assertEqual(content, expected)
+
+class HTMLReaderTest(unittest.TestCase):
+
+    def test_article_with_metadata(self):
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_filename('article_with_metadata.html'))
+        expected = {
+            'category': 'yeah',
+            'author': u'Alexis Métaireau',
+            'title': 'This is a super article !',
+            'summary': u'''
+        Multi-line metadata should be supported
+        as well as <strong>inline markup</strong>.
+        ''',
+            'date': datetime.datetime(2010, 12, 2, 10, 14),
+            'tags': ['foo', 'bar', 'foobar'],
+            'custom_field': 'http://notmyidea.org',
+        }
+
+        for key, value in expected.items():
+            self.assertEquals(value, metadata[key], key)
+
+    def test_article_with_keywords(self):
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_filename('article_with_keywords.html'))
+        expected = {
+            'tags': ['foo', 'bar', 'foobar'],
+        }
+
+        for key, value in expected.items():
+            self.assertEquals(value, metadata[key], key)
+
+    def test_article_metadata_key_lowercase(self):
+        """Keys of metadata should be lowercase."""
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_filename('article_with_uppercase_metadata.html'))
+        self.assertIn('category', metadata, "Key should be lowercase.")
+        self.assertEquals('Yeah', metadata.get('category'), "Value keeps cases.")

From 0373c15e430e168928b645be3b9513f093b97403 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Thu, 14 Jun 2012 23:16:27 -0400
Subject: [PATCH 02/19] include html comments properly in reader

---
 pelican/readers.py                       |  2 ++
 tests/content/article_with_comments.html |  7 +++++
 tests/test_readers.py                    | 36 ++++++++++++++++++------
 3 files changed, 36 insertions(+), 9 deletions(-)
 create mode 100644 tests/content/article_with_comments.html

diff --git a/pelican/readers.py b/pelican/readers.py
index 9ce3e3c0..e3d0e0dd 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -190,6 +190,8 @@ class HTMLReader(Reader):
         def handle_comment(self, data):
             if self._in_body and data.strip() == 'PELICAN_END_SUMMARY':
                 self.metadata['summary'] = self._data_buffer
+            else:
+                self._data_buffer += '<!--{}-->'.format(data)
 
         def handle_data(self, data):
             self._data_buffer += data
diff --git a/tests/content/article_with_comments.html b/tests/content/article_with_comments.html
new file mode 100644
index 00000000..f222682d
--- /dev/null
+++ b/tests/content/article_with_comments.html
@@ -0,0 +1,7 @@
+<html>
+    <body>
+        Summary comment is not included.
+        <!-- PELICAN_END_SUMMARY -->
+        <!--  But this comment is (including extra whitespace)    -->
+    </body>
+</html>
diff --git a/tests/test_readers.py b/tests/test_readers.py
index 52887068..b3e30bfc 100644
--- a/tests/test_readers.py
+++ b/tests/test_readers.py
@@ -88,6 +88,33 @@ class MdReaderTest(unittest.TestCase):
         self.assertEqual(content, expected)
 
 class HTMLReaderTest(unittest.TestCase):
+    def test_article_with_comments(self):
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_filename('article_with_comments.html'))
+        expected = {
+            'summary': '''
+        Summary comment is not included.
+        ''',
+        }
+
+        for key, value in expected.items():
+            self.assertEquals(value, metadata[key], key)
+
+        self.assertEquals('''
+        Summary comment is not included.
+        
+        <!--  But this comment is (including extra whitespace)    -->
+    ''', content)
+
+    def test_article_with_keywords(self):
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_filename('article_with_keywords.html'))
+        expected = {
+            'tags': ['foo', 'bar', 'foobar'],
+        }
+
+        for key, value in expected.items():
+            self.assertEquals(value, metadata[key], key)
 
     def test_article_with_metadata(self):
         reader = readers.HTMLReader({})
@@ -108,15 +135,6 @@ class HTMLReaderTest(unittest.TestCase):
         for key, value in expected.items():
             self.assertEquals(value, metadata[key], key)
 
-    def test_article_with_keywords(self):
-        reader = readers.HTMLReader({})
-        content, metadata = reader.read(_filename('article_with_keywords.html'))
-        expected = {
-            'tags': ['foo', 'bar', 'foobar'],
-        }
-
-        for key, value in expected.items():
-            self.assertEquals(value, metadata[key], key)
 
     def test_article_metadata_key_lowercase(self):
         """Keys of metadata should be lowercase."""

From c608d39aa40b8304f4e2e241564796201e582da4 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Wed, 20 Jun 2012 19:52:17 -0400
Subject: [PATCH 03/19] re-import htmlparser

---
 pelican/readers.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pelican/readers.py b/pelican/readers.py
index 870c11c8..1916fa1e 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -15,6 +15,8 @@ except ImportError:
     Markdown = False  # NOQA
 import re
 
+from htmlparser import HTMLParser
+
 from pelican.contents import Category, Tag, Author
 from pelican.utils import get_date, open
 

From caa4442abb145d419a3120c7339ad7ecf91ac56c Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Wed, 20 Jun 2012 19:59:32 -0400
Subject: [PATCH 04/19] re-import cgi. properly turn utils.open into a context
 manager

---
 pelican/readers.py | 3 ++-
 pelican/utils.py   | 9 +++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pelican/readers.py b/pelican/readers.py
index 1916fa1e..d05ab40f 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -15,7 +15,8 @@ except ImportError:
     Markdown = False  # NOQA
 import re
 
-from htmlparser import HTMLParser
+import cgi
+from HTMLParser import HTMLParser
 
 from pelican.contents import Category, Tag, Author
 from pelican.utils import get_date, open
diff --git a/pelican/utils.py b/pelican/utils.py
index 0940bf72..088a8faa 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -34,10 +34,15 @@ def get_date(string):
     raise ValueError("'%s' is not a valid date" % string)
 
 
-def open(filename):
+class open(object):
     """Open a file and return it's content"""
-    return _open(filename, encoding='utf-8').read()
+    def __init__(self, filename):
+        self.filename = filename
+    def __enter__(self):
+        return _open(self.filename, encoding='utf-8').read()
 
+    def __exit__(self, exc_type, exc_value, traceback):
+        pass
 
 def slugify(value):
     """

From 56800a1d43ff9e07659d0f5ad570a9004d44cd74 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Wed, 20 Jun 2012 20:02:41 -0400
Subject: [PATCH 05/19] fix failing test with new open context manager

---
 pelican/readers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pelican/readers.py b/pelican/readers.py
index d05ab40f..1d06bd6d 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -119,9 +119,9 @@ class MarkdownReader(Reader):
 
     def read(self, filename):
         """Parse content and metadata of markdown files"""
-        text = open(filename)
-        md = Markdown(extensions=set(self.extensions + ['meta']))
-        content = md.convert(text)
+        with open(filename) as text:
+            md = Markdown(extensions=set(self.extensions + ['meta']))
+            content = md.convert(text)
 
         metadata = {}
         for name, value in md.Meta.items():

From c0578eb9ab77c7be4a045f58a7844222ccbe6b95 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Wed, 20 Jun 2012 23:19:06 -0400
Subject: [PATCH 06/19] handle escaped chars in html properly

---
 pelican/readers.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pelican/readers.py b/pelican/readers.py
index 1d06bd6d..08ef4cf8 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -196,6 +196,12 @@ class HTMLReader(Reader):
         def handle_data(self, data):
             self._data_buffer += data
 
+        def handle_entityref(self, data):
+            self._data_buffer += '&{};'.format(data)
+
+        def handle_charref(self, data):
+            self._data_buffer += '&{};'.format(data)
+            
         def build_tag(self, tag, attrs, close_tag):
             result = '<{}'.format(cgi.escape(tag))
             result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs))

From 036728a194695d463123c714954c25a3d6a826d5 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Thu, 21 Jun 2012 09:05:27 -0400
Subject: [PATCH 07/19] properly write out charrefs

---
 pelican/readers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pelican/readers.py b/pelican/readers.py
index 08ef4cf8..93549d96 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -200,7 +200,7 @@ class HTMLReader(Reader):
             self._data_buffer += '&{};'.format(data)
 
         def handle_charref(self, data):
-            self._data_buffer += '&{};'.format(data)
+            self._data_buffer += '&#{};'.format(data)
             
         def build_tag(self, tag, attrs, close_tag):
             result = '<{}'.format(cgi.escape(tag))

From 847a6fe3cee7f05e36679d6b12fafaf58cfc1045 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Thu, 21 Jun 2012 09:12:38 -0400
Subject: [PATCH 08/19] change 'markdown' to HTML in the comments

---
 pelican/readers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pelican/readers.py b/pelican/readers.py
index 93549d96..9d200599 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -222,7 +222,7 @@ class HTMLReader(Reader):
             return next((x[1] for x in attrs if x[0] == name), default)
 
     def read(self, filename):
-        """Parse content and metadata of markdown files"""
+        """Parse content and metadata of HTML files"""
         with open(filename) as content:
             parser = self._HTMLParser(self.settings)
             parser.feed(content)

From a86d5fda71a2d2ce7295cb385641331b139bf361 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Mon, 9 Jul 2012 22:43:51 -0400
Subject: [PATCH 09/19] add documentation for html reader

---
 docs/getting_started.rst | 30 ++++++++++++++++++++++++++++++
 docs/internals.rst       |  2 +-
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 93d578a0..d60cce83 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -154,6 +154,36 @@ Markdown posts should follow this pattern::
 
     This is the content of my super blog post.
 
+Lastly, you can use Vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican 
+interprets the HTML in a very straightforward manner, reading meta data out
+of ``meta`` tags, the title out of the ``title`` tag, and the body out of the 
+``body`` tag::
+
+    <html>
+        <head>
+            <title>My super title</title>
+            <meta name="tags" contents="thats, awesome" />
+            <meta name="date" contents="2012-07-09 22:28" />
+            <meta name="category" contents="yeah" />
+            <meta name="author" contents="Alexis Métaireau" />
+        </head>
+        <body>
+            This is the content of my super blog post.
+            <!-- PELICAN_END_SUMMARY -->
+            Content continues down here.
+        </body>
+    </html>
+
+With HTML, there are two simple exceptions to the standard metadata. First, 
+``tags`` can be specified either with the ``tags`` metadata, as is standard in 
+Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can 
+be used interchangeably. The second note is that summaries are done differently 
+in HTML posts. Either a ``summary`` metadata tag can be supplied, or, as seen 
+above, you can place an HTML comment, ``<!-- PELICAN_END_SUMMARY -->``, that 
+Pelican will recognize. Everything before the comment will be treated as a 
+summary. The content of the post will contain everything in the body tag, with 
+the special comment stripped out.
+
 Note that, aside from the title, none of this metadata is mandatory: if the date
 is not specified, Pelican will rely on the file's "mtime" timestamp, and the
 category can be determined by the directory in which the file resides. For
diff --git a/docs/internals.rst b/docs/internals.rst
index 6b6f991f..a94d1c56 100644
--- a/docs/internals.rst
+++ b/docs/internals.rst
@@ -23,7 +23,7 @@ The logic is separated into different classes and concepts:
   on. Since those operations are commonly used, the object is created once and
   then passed to the generators.
 
-* **Readers** are used to read from various formats (Markdown and
+* **Readers** are used to read from various formats (HTML, Markdown and
   reStructuredText for now, but the system is extensible). Given a file, they return
   metadata (author, tags, category, etc.) and content (HTML-formatted).
 

From 4ec6cefe1db92c0bc6cea9a95c810e3f5b455865 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Mon, 9 Jul 2012 22:45:34 -0400
Subject: [PATCH 10/19] fix grammar

---
 docs/getting_started.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index d60cce83..5e553815 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -154,7 +154,7 @@ Markdown posts should follow this pattern::
 
     This is the content of my super blog post.
 
-Lastly, you can use Vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican 
+Lastly, you can use vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican 
 interprets the HTML in a very straightforward manner, reading meta data out
 of ``meta`` tags, the title out of the ``title`` tag, and the body out of the 
 ``body`` tag::

From 357f3a3da211cffeda1501e1c8fb54dc069694f6 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Thu, 21 Jun 2012 09:05:27 -0400
Subject: [PATCH 11/19] properly write out charrefs

---
 pelican/readers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pelican/readers.py b/pelican/readers.py
index 6fe8e894..de3df66f 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -235,7 +235,7 @@ class HTMLReader(Reader):
             self._data_buffer += '&{};'.format(data)
 
         def handle_charref(self, data):
-            self._data_buffer += '&{};'.format(data)
+            self._data_buffer += '&#{};'.format(data)
             
         def build_tag(self, tag, attrs, close_tag):
             result = '<{}'.format(cgi.escape(tag))

From 5f639b9a3b79213d5fd631216888af71990723c2 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Mon, 28 Jan 2013 21:46:54 -0500
Subject: [PATCH 12/19] git rebase master

---
 pelican/readers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pelican/readers.py b/pelican/readers.py
index de3df66f..60fabe82 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -257,8 +257,8 @@ class HTMLReader(Reader):
             return next((x[1] for x in attrs if x[0] == name), default)
 
     def read(self, filename):
-        """Parse content and metadata of markdown files"""
-        with pelican_open(filename) as content:
+        """Parse content and metadata of HTML files"""
+        with open(filename) as content:
             parser = self._HTMLParser(self.settings)
             parser.feed(content)
             parser.close()

From bf6f16e3839be680296c8325922606410bb86d8a Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Mon, 9 Jul 2012 22:43:51 -0400
Subject: [PATCH 13/19] add documentation for html reader

---
 docs/getting_started.rst | 36 ++++++++++++++++++++++++++++++++++++
 docs/internals.rst       |  4 ++--
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 0952c7d9..7592a5ef 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -190,6 +190,42 @@ syntax for Markdown posts should follow this pattern::
 
     This is the content of my super blog post.
 
+Lastly, you can use Vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican 
+interprets the HTML in a very straightforward manner, reading meta data out
+of ``meta`` tags, the title out of the ``title`` tag, and the body out of the 
+``body`` tag::
+
+    <html>
+        <head>
+            <title>My super title</title>
+            <meta name="tags" contents="thats, awesome" />
+            <meta name="date" contents="2012-07-09 22:28" />
+            <meta name="category" contents="yeah" />
+            <meta name="author" contents="Alexis Métaireau" />
+        </head>
+        <body>
+            This is the content of my super blog post.
+            <!-- PELICAN_END_SUMMARY -->
+            Content continues down here.
+        </body>
+    </html>
+
+With HTML, there are two simple exceptions to the standard metadata. First, 
+``tags`` can be specified either with the ``tags`` metadata, as is standard in 
+Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can 
+be used interchangeably. The second note is that summaries are done differently 
+in HTML posts. Either a ``summary`` metadata tag can be supplied, or, as seen 
+above, you can place an HTML comment, ``<!-- PELICAN_END_SUMMARY -->``, that 
+Pelican will recognize. Everything before the comment will be treated as a 
+summary. The content of the post will contain everything in the body tag, with 
+the special comment stripped out.
+
+Note that, aside from the title, none of this metadata is mandatory: if the date
+is not specified, Pelican will rely on the file's "mtime" timestamp, and the
+category can be determined by the directory in which the file resides. For
+example, a file located at ``python/foobar/myfoobar.rst`` will have a category of
+``foobar``.
+
 Note that, aside from the title, none of this metadata is mandatory: if the
 date is not specified, Pelican can rely on the file's "mtime" timestamp through
 the ``DEFAULT_DATE`` setting, and the category can be determined by the
diff --git a/docs/internals.rst b/docs/internals.rst
index cadd300b..704122ba 100644
--- a/docs/internals.rst
+++ b/docs/internals.rst
@@ -23,8 +23,8 @@ The logic is separated into different classes and concepts:
   on. Since those operations are commonly used, the object is created once and
   then passed to the generators.
 
-* **Readers** are used to read from various formats (AsciiDoc, Markdown and
-  reStructuredText for now, but the system is extensible). Given a file, they
+* **Readers** are used to read from various formats (AsciiDoc, HTML, Markdown and
+  reStructuredText for now, but the system is extensible). Given a file, they 
   return metadata (author, tags, category, etc.) and content (HTML-formatted).
 
 * **Generators** generate the different outputs. For instance, Pelican comes with

From e6a4fe3fc40f003ad9ecba183f12a2fdc6a5adeb Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Mon, 9 Jul 2012 22:45:34 -0400
Subject: [PATCH 14/19] fix grammar

---
 docs/getting_started.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 7592a5ef..c7f2e257 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -190,7 +190,7 @@ syntax for Markdown posts should follow this pattern::
 
     This is the content of my super blog post.
 
-Lastly, you can use Vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican 
+Lastly, you can use vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican 
 interprets the HTML in a very straightforward manner, reading meta data out
 of ``meta`` tags, the title out of the ``title`` tag, and the body out of the 
 ``body`` tag::

From 7b59b34a73560b3eeb6c737a5e8ce2e5b9c4c36b Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Mon, 28 Jan 2013 22:11:06 -0500
Subject: [PATCH 15/19] get tests passing

---
 pelican/readers.py                       | 10 ++++-----
 tests/content/article_with_comments.html |  7 ++++---
 tests/content/article_with_metadata.html |  2 +-
 tests/test_readers.py                    | 26 +++++++-----------------
 4 files changed, 17 insertions(+), 28 deletions(-)

diff --git a/pelican/readers.py b/pelican/readers.py
index 60fabe82..9b8be192 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -223,10 +223,10 @@ class HTMLReader(Reader):
                 self._data_buffer += self.build_tag(tag, attrs, True)
 
         def handle_comment(self, data):
-            if self._in_body and data.strip() == 'PELICAN_END_SUMMARY':
-                self.metadata['summary'] = self._data_buffer
-            else:
-                self._data_buffer += '<!--{}-->'.format(data)
+        #    if self._in_body and data.strip() == 'PELICAN_END_SUMMARY':
+        #        self.metadata['summary'] = self._data_buffer
+        #    else:
+            self._data_buffer += '<!--{}-->'.format(data)
 
         def handle_data(self, data):
             self._data_buffer += data
@@ -258,7 +258,7 @@ class HTMLReader(Reader):
 
     def read(self, filename):
         """Parse content and metadata of HTML files"""
-        with open(filename) as content:
+        with pelican_open(filename) as content:
             parser = self._HTMLParser(self.settings)
             parser.feed(content)
             parser.close()
diff --git a/tests/content/article_with_comments.html b/tests/content/article_with_comments.html
index f222682d..289e4a66 100644
--- a/tests/content/article_with_comments.html
+++ b/tests/content/article_with_comments.html
@@ -1,7 +1,8 @@
 <html>
+    <head>
+    </head>
     <body>
-        Summary comment is not included.
-        <!-- PELICAN_END_SUMMARY -->
-        <!--  But this comment is (including extra whitespace)    -->
+        Body content
+        <!--  This comment is included (including extra whitespace)   -->
     </body>
 </html>
diff --git a/tests/content/article_with_metadata.html b/tests/content/article_with_metadata.html
index 2bd77241..b108ac8a 100644
--- a/tests/content/article_with_metadata.html
+++ b/tests/content/article_with_metadata.html
@@ -5,11 +5,11 @@
         <meta name="date" contents="2010-12-02 10:14" />
         <meta name="category" contents="yeah" />
         <meta name="author" contents="Alexis Métaireau" />
+        <meta name="summary" contents="Summary and stuff" />
         <meta name="custom_field" contents="http://notmyidea.org" />
     </head>
     <body>
         Multi-line metadata should be supported
         as well as <strong>inline markup</strong>.
-        <!-- PELICAN_END_SUMMARY -->
     </body>
 </html>
diff --git a/tests/test_readers.py b/tests/test_readers.py
index 8cee4c1a..49130669 100644
--- a/tests/test_readers.py
+++ b/tests/test_readers.py
@@ -264,25 +264,16 @@ class AdReaderTest(unittest.TestCase):
 class HTMLReaderTest(unittest.TestCase):
     def test_article_with_comments(self):
         reader = readers.HTMLReader({})
-        content, metadata = reader.read(_filename('article_with_comments.html'))
-        expected = {
-            'summary': '''
-        Summary comment is not included.
-        ''',
-        }
-
-        for key, value in expected.items():
-            self.assertEquals(value, metadata[key], key)
+        content, metadata = reader.read(_path('article_with_comments.html'))
 
         self.assertEquals('''
-        Summary comment is not included.
-        
-        <!--  But this comment is (including extra whitespace)    -->
+        Body content
+        <!--  This comment is included (including extra whitespace)   -->
     ''', content)
 
     def test_article_with_keywords(self):
         reader = readers.HTMLReader({})
-        content, metadata = reader.read(_filename('article_with_keywords.html'))
+        content, metadata = reader.read(_path('article_with_keywords.html'))
         expected = {
             'tags': ['foo', 'bar', 'foobar'],
         }
@@ -292,15 +283,12 @@ class HTMLReaderTest(unittest.TestCase):
 
     def test_article_with_metadata(self):
         reader = readers.HTMLReader({})
-        content, metadata = reader.read(_filename('article_with_metadata.html'))
+        content, metadata = reader.read(_path('article_with_metadata.html'))
         expected = {
             'category': 'yeah',
             'author': u'Alexis Métaireau',
             'title': 'This is a super article !',
-            'summary': u'''
-        Multi-line metadata should be supported
-        as well as <strong>inline markup</strong>.
-        ''',
+            'summary': u'''Summary and stuff''',
             'date': datetime.datetime(2010, 12, 2, 10, 14),
             'tags': ['foo', 'bar', 'foobar'],
             'custom_field': 'http://notmyidea.org',
@@ -313,6 +301,6 @@ class HTMLReaderTest(unittest.TestCase):
     def test_article_metadata_key_lowercase(self):
         """Keys of metadata should be lowercase."""
         reader = readers.HTMLReader({})
-        content, metadata = reader.read(_filename('article_with_uppercase_metadata.html'))
+        content, metadata = reader.read(_path('article_with_uppercase_metadata.html'))
         self.assertIn('category', metadata, "Key should be lowercase.")
         self.assertEquals('Yeah', metadata.get('category'), "Value keeps cases.")

From 2a3d7d031949e0b3ae693b8867350d3e40bb7f13 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Mon, 28 Jan 2013 22:21:45 -0500
Subject: [PATCH 16/19] fix python3 support

---
 pelican/readers.py    | 5 ++++-
 tests/test_readers.py | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/pelican/readers.py b/pelican/readers.py
index 9b8be192..8667a299 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -26,7 +26,10 @@ except ImportError:
 import re
 
 import cgi
-from HTMLParser import HTMLParser
+try:
+    from html.parser import HTMLParser
+except ImportError:
+    from HTMLParser import HTMLParser
 
 from pelican.contents import Category, Tag, Author
 from pelican.utils import get_date, pelican_open
diff --git a/tests/test_readers.py b/tests/test_readers.py
index 49130669..39bc2067 100644
--- a/tests/test_readers.py
+++ b/tests/test_readers.py
@@ -286,9 +286,9 @@ class HTMLReaderTest(unittest.TestCase):
         content, metadata = reader.read(_path('article_with_metadata.html'))
         expected = {
             'category': 'yeah',
-            'author': u'Alexis Métaireau',
+            'author': 'Alexis Métaireau',
             'title': 'This is a super article !',
-            'summary': u'''Summary and stuff''',
+            'summary': 'Summary and stuff',
             'date': datetime.datetime(2010, 12, 2, 10, 14),
             'tags': ['foo', 'bar', 'foobar'],
             'custom_field': 'http://notmyidea.org',

From d5bfec3a8b7ee4eef9f7359b93a8ce86cf2d701a Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Mon, 28 Jan 2013 22:25:15 -0500
Subject: [PATCH 17/19] update documentation and remove commented out code

---
 docs/getting_started.rst | 12 +++---------
 pelican/readers.py       |  3 ---
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index c7f2e257..7155efce 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -202,23 +202,17 @@ of ``meta`` tags, the title out of the ``title`` tag, and the body out of the
             <meta name="date" contents="2012-07-09 22:28" />
             <meta name="category" contents="yeah" />
             <meta name="author" contents="Alexis Métaireau" />
+            <meta name="summary" contents="Short version for index and feeds" />
         </head>
         <body>
             This is the content of my super blog post.
-            <!-- PELICAN_END_SUMMARY -->
-            Content continues down here.
         </body>
     </html>
 
-With HTML, there are two simple exceptions to the standard metadata. First, 
+With HTML, there is one simple exception to the standard metadata.
 ``tags`` can be specified either with the ``tags`` metadata, as is standard in 
 Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can 
-be used interchangeably. The second note is that summaries are done differently 
-in HTML posts. Either a ``summary`` metadata tag can be supplied, or, as seen 
-above, you can place an HTML comment, ``<!-- PELICAN_END_SUMMARY -->``, that 
-Pelican will recognize. Everything before the comment will be treated as a 
-summary. The content of the post will contain everything in the body tag, with 
-the special comment stripped out.
+be used interchangeably.
 
 Note that, aside from the title, none of this metadata is mandatory: if the date
 is not specified, Pelican will rely on the file's "mtime" timestamp, and the
diff --git a/pelican/readers.py b/pelican/readers.py
index 8667a299..ecb49f9c 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -226,9 +226,6 @@ class HTMLReader(Reader):
                 self._data_buffer += self.build_tag(tag, attrs, True)
 
         def handle_comment(self, data):
-        #    if self._in_body and data.strip() == 'PELICAN_END_SUMMARY':
-        #        self.metadata['summary'] = self._data_buffer
-        #    else:
             self._data_buffer += '<!--{}-->'.format(data)
 
         def handle_data(self, data):

From 8ba6a4d19d5d31d7b183332d812236b4b69d8dcd Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Sat, 9 Feb 2013 09:27:45 -0500
Subject: [PATCH 18/19] fix documentation

---
 docs/getting_started.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 7155efce..4aeddbfb 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -215,10 +215,10 @@ Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can
 be used interchangeably.
 
 Note that, aside from the title, none of this metadata is mandatory: if the date
-is not specified, Pelican will rely on the file's "mtime" timestamp, and the
-category can be determined by the directory in which the file resides. For
-example, a file located at ``python/foobar/myfoobar.rst`` will have a category of
-``foobar``.
+is not specified and DEFAULT_DATE is None, Pelican will rely on the file's 
+"mtime" timestamp, and the category can be determined by the directory in which 
+the file resides. For example, a file located at ``python/foobar/myfoobar.rst`` 
+will have a category of ``foobar``.
 
 Note that, aside from the title, none of this metadata is mandatory: if the
 date is not specified, Pelican can rely on the file's "mtime" timestamp through

From 5f5b300ba5c2703c015ca5b0db540cb824cf2bd6 Mon Sep 17 00:00:00 2001
From: dave mankoff <mankyd@gmail.com>
Date: Sat, 9 Feb 2013 09:51:02 -0500
Subject: [PATCH 19/19] fix documentation

---
 docs/getting_started.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 4aeddbfb..afea8c01 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -215,7 +215,7 @@ Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can
 be used interchangeably.
 
 Note that, aside from the title, none of this metadata is mandatory: if the date
-is not specified and DEFAULT_DATE is None, Pelican will rely on the file's 
+is not specified and ``DEFAULT_DATE`` is ``'fs'``, Pelican will rely on the file's
 "mtime" timestamp, and the category can be determined by the directory in which 
 the file resides. For example, a file located at ``python/foobar/myfoobar.rst`` 
 will have a category of ``foobar``.