Merge pull request #382 from mankyd/htmlparser

New, more thorough HTMLParser
2025-10-15 20:28:56 +02:00 · 2013-02-09 16:48:50 -08:00 · 2013-02-09 16:48:50 -08:00 · f3bc2ece86
commit f3bc2ece86
parent 06899aa826 5f5b300ba5
9 changed files with 227 additions and 22 deletions
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@ -190,6 +190,36 @@ syntax for Markdown posts should follow this pattern::

    This is the content of my super blog post.

+Lastly, you can use vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican
+interprets the HTML in a very straightforward manner, reading metadata out
+of ``meta`` tags, the title out of the ``title`` tag, and the body out of the
+``body`` tag::
+
+    <html>
+        <head>
+            <title>My super title</title>
+            <meta name="tags" contents="thats, awesome" />
+            <meta name="date" contents="2012-07-09 22:28" />
+            <meta name="category" contents="yeah" />
+            <meta name="author" contents="Alexis Métaireau" />
+            <meta name="summary" contents="Short version for index and feeds" />
+        </head>
+        <body>
+            This is the content of my super blog post.
+        </body>
+    </html>
+
+With HTML, there is one simple exception to the standard metadata.
+``tags`` can be specified either with the ``tags`` metadata, as is standard in 
+Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can 
+be used interchangeably.
+
+Note that, aside from the title, none of this metadata is mandatory: if the date
+is not specified and ``DEFAULT_DATE`` is set to ``'fs'``, Pelican will rely on the
+file's "mtime" timestamp, and the category can be determined by the directory in
+which the file resides. For example, a file located at
+``python/foobar/myfoobar.html`` will have a category of ``foobar``.
+
 Note that, aside from the title, none of this metadata is mandatory: if the
 date is not specified, Pelican can rely on the file's "mtime" timestamp through
 the ``DEFAULT_DATE`` setting, and the category can be determined by the
--- a/docs/internals.rst
+++ b/docs/internals.rst
@ -23,8 +23,8 @@ The logic is separated into different classes and concepts:
  on. Since those operations are commonly used, the object is created once and
  then passed to the generators.

-* **Readers** are used to read from various formats (AsciiDoc, Markdown and
-  reStructuredText for now, but the system is extensible). Given a file, they
+* **Readers** are used to read from various formats (AsciiDoc, HTML, Markdown and
+  reStructuredText for now, but the system is extensible). Given a file, they 
  return metadata (author, tags, category, etc.) and content (HTML-formatted).

 * **Generators** generate the different outputs. For instance, Pelican comes with
--- a/pelican/readers.py
+++ b/pelican/readers.py
@ -25,6 +25,12 @@ except ImportError:
    asciidoc = False
 import re

+import cgi
+try:
+    from html import escape
+except ImportError:
+    from cgi import escape
+try:
+    from html.parser import HTMLParser
+except ImportError:
+    from HTMLParser import HTMLParser
+
 from pelican.contents import Category, Tag, Author
 from pelican.utils import get_date, pelican_open

@ -154,30 +160,114 @@ class MarkdownReader(Reader):

    def read(self, source_path):
        """Parse content and metadata of markdown files"""
-        text = pelican_open(source_path)
-        md = Markdown(extensions=set(self.extensions + ['meta']))
-        content = md.convert(text)
+
+        with pelican_open(source_path) as text:
+            md = Markdown(extensions=set(self.extensions + ['meta']))
+            content = md.convert(text)

        metadata = self._parse_metadata(md.Meta)
        return content, metadata

+class HTMLReader(Reader):
+    """Parses HTML files as input, looking for meta, title, and body tags"""
+    file_extensions = ['htm', 'html']
+    enabled = True

-class HtmlReader(Reader):
-    file_extensions = ['html', 'htm']
-    _re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>')
+    class _HTMLParser(HTMLParser):
+        def __init__(self, settings):
+            HTMLParser.__init__(self)
+            self.body = ''
+            self.metadata = {}
+            self.settings = settings

-    def read(self, source_path):
-        """Parse content and metadata of (x)HTML files"""
-        with open(source_path) as content:
-            metadata = {'title': 'unnamed'}
-            for i in self._re.findall(content):
-                key = i.split(':')[0][5:].strip()
-                value = i.split(':')[-1][:-3].strip()
-                name = key.lower()
-                metadata[name] = self.process_metadata(name, value)
+            self._data_buffer = ''

-            return content, metadata
+            self._in_top_level = True
+            self._in_head = False
+            self._in_title = False
+            self._in_body = False
+            self._in_tags = False

+        def handle_starttag(self, tag, attrs):
+            """Track document structure and capture body markup for an opening tag.
+
+            ``head`` and ``body`` are only recognised at the top level,
+            ``title`` and ``meta`` only inside ``head``. Any other opening
+            tag seen while inside ``body`` is re-serialised into the buffer.
+            """
+            if self._in_top_level and tag == 'head':
+                self._in_head = True
+                self._in_top_level = False
+                return
+
+            if self._in_head and tag == 'title':
+                # Start collecting the title text from a clean buffer.
+                self._data_buffer = ''
+                self._in_title = True
+                return
+
+            if self._in_top_level and tag == 'body':
+                # Start collecting the body markup from a clean buffer.
+                self._data_buffer = ''
+                self._in_body = True
+                self._in_top_level = False
+                return
+
+            if self._in_head and tag == 'meta':
+                self._handle_meta_tag(attrs)
+                return
+
+            if self._in_body:
+                opening = self.build_tag(tag, attrs, False)
+                self._data_buffer = self._data_buffer + opening
+
+        def handle_endtag(self, tag):
+            """Update parser state, or capture body markup, for a closing tag.
+
+            ``</head>`` leaves the head section if one is open. ``</title>``
+            stores the buffered text as the ``title`` metadata, but only when
+            it closes a title opened inside ``head``: a ``</title>`` anywhere
+            else (for instance an SVG ``<title>`` inside the body) is ordinary
+            markup and must not overwrite the article title. ``</body>``
+            stores the buffered markup as the body. Any other closing tag
+            seen inside ``body`` is re-serialised into the buffer.
+            """
+            if tag == 'head':
+                if self._in_head:
+                    self._in_head = False
+                    self._in_top_level = True
+            elif tag == 'title' and self._in_title:
+                self._in_title = False
+                self.metadata['title'] = self._data_buffer
+            elif tag == 'body':
+                self.body = self._data_buffer
+                self._in_body = False
+                self._in_top_level = True
+            elif self._in_body:
+                # quote=False keeps the output identical to what cgi.escape
+                # produced by default.
+                self._data_buffer += '</{}>'.format(escape(tag, quote=False))
+
+        def handle_startendtag(self, tag, attrs):
+            """Handle a self-closing tag such as ``<meta ... />`` or ``<br />``.
+
+            A ``meta`` tag inside ``head`` is recorded as metadata. Any tag
+            seen while inside ``body`` is re-serialised, self-closed, into
+            the buffer.
+            """
+            is_head_meta = tag == 'meta' and self._in_head
+            if is_head_meta:
+                self._handle_meta_tag(attrs)
+            if not self._in_body:
+                return
+            self._data_buffer = self._data_buffer + self.build_tag(tag, attrs, True)
+
+        def handle_comment(self, data):
+            """Append the comment, wrapped in ``<!--``/``-->`` again, to the buffer."""
+            comment = '<!--{}-->'.format(data)
+            self._data_buffer = self._data_buffer + comment
+
+        def handle_data(self, data):
+            """Append a run of text to the buffer unchanged."""
+            self._data_buffer = self._data_buffer + data
+
+        def handle_entityref(self, data):
+            """Append a named entity reference, rebuilt as ``&name;``, to the buffer."""
+            entity = '&{};'.format(data)
+            self._data_buffer = self._data_buffer + entity
+
+        def handle_charref(self, data):
+            """Append a numeric character reference, rebuilt as ``&#N;``, to the buffer."""
+            reference = '&#{};'.format(data)
+            self._data_buffer = self._data_buffer + reference
+            
+        def build_tag(self, tag, attrs, close_tag):
+            """Serialise a tag and its attributes back into HTML text.
+
+            ``attrs`` is the list of ``(name, value)`` pairs handed over by
+            ``HTMLParser``. A value of ``None`` (an attribute written without
+            a value, e.g. ``<input disabled>``) is emitted as a bare attribute
+            name instead of crashing the escaper. Attribute values are escaped
+            including double quotes, so a value containing ``"`` cannot break
+            out of the surrounding quotes. When ``close_tag`` is true the tag
+            is written self-closed (`` />``), otherwise as an opening tag.
+            """
+            parts = ['<{}'.format(escape(tag, quote=False))]
+            for key, value in attrs:
+                parts.append(' {}'.format(escape(key, quote=False)))
+                if value is not None:
+                    parts.append('="{}"'.format(escape(value, quote=True)))
+            parts.append(' />' if close_tag else '>')
+            return ''.join(parts)
+
+        def _handle_meta_tag(self, attrs):
+            """Record one ``<meta>`` tag as a metadata entry.
+
+            The lower-cased ``name`` attribute is the key, with ``keywords``
+            stored under ``tags``. The value is read from the ``contents``
+            attribute, falling back to the standard HTML ``content``
+            attribute, and defaults to an empty string. Meta tags without a
+            ``name`` (``<meta charset=...>``, ``<meta http-equiv=...>``)
+            carry no metadata key and are ignored.
+            """
+            name = self._attr_value(attrs, 'name')
+            if name is None:
+                # Nothing to key the value on; previously this raised
+                # AttributeError on None.lower().
+                return
+            name = name.lower()
+
+            contents = self._attr_value(attrs, 'contents')
+            if contents is None:
+                contents = self._attr_value(attrs, 'content', '')
+
+            if name == 'keywords':
+                name = 'tags'
+            self.metadata[name] = contents
+
+        @classmethod
+        def _attr_value(cls, attrs, name, default=None):
+            """Return the value of the first attribute called ``name``.
+
+            ``attrs`` is a sequence of ``(name, value)`` pairs; ``default`` is
+            returned when no pair has the requested name.
+            """
+            for pair in attrs:
+                if pair[0] == name:
+                    return pair[1]
+            return default
+
+    def read(self, filename):
+        """Read an HTML file and return its ``(body, metadata)`` pair.
+
+        The file is run through the nested ``_HTMLParser``; every raw
+        metadata value it collected is then passed through
+        ``process_metadata`` under its key.
+        """
+        with pelican_open(filename) as markup:
+            html_parser = self._HTMLParser(self.settings)
+            html_parser.feed(markup)
+            html_parser.close()
+
+        processed = dict(
+            (key, self.process_metadata(key, raw))
+            for key, raw in html_parser.metadata.items())
+        return html_parser.body, processed

 class AsciiDocReader(Reader):
    enabled = bool(asciidoc)
--- a/pelican/utils.py
+++ b/pelican/utils.py
@ -185,10 +185,16 @@ def get_date(string):
    raise ValueError("'%s' is not a valid date" % string)


-def pelican_open(path):
+class pelican_open(object):
    """Open a file and return it's content"""
-    return open(path, encoding='utf-8').read()
+    def __init__(self, filename):
+        # Only remember the path here; the file is opened in __enter__.
+        self.filename = filename

+    def __enter__(self):
+        """Return the whole content of the file, decoded as UTF-8.
+
+        The file is opened, read and closed right here, so no handle is left
+        open for the garbage collector to clean up (the previous
+        ``open(...).read()`` never closed it).
+        """
+        with open(self.filename, encoding='utf-8') as source:
+            return source.read()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # No-op: returns None, so an exception raised inside the ``with``
+        # block is not suppressed.
+        pass

 def slugify(value):
    """
--- a/tests/content/article_with_comments.html
+++ b/tests/content/article_with_comments.html
@ -0,0 +1,8 @@
+<html>
+    <head>
+    </head>
+    <body>
+        Body content
+        <!--  This comment is included (including extra whitespace)   -->
+    </body>
+</html>
--- a/tests/content/article_with_keywords.html
+++ b/tests/content/article_with_keywords.html
@ -0,0 +1,6 @@
+<html>
+    <head>
+        <title>This is a super article !</title>
+        <meta name="keywords" contents="foo, bar, foobar" />
+    </head>
+</html>
--- a/tests/content/article_with_metadata.html
+++ b/tests/content/article_with_metadata.html
@ -0,0 +1,15 @@
+<html>
+    <head>
+        <title>This is a super article !</title>
+        <meta name="tags" contents="foo, bar, foobar" />
+        <meta name="date" contents="2010-12-02 10:14" />
+        <meta name="category" contents="yeah" />
+        <meta name="author" contents="Alexis Métaireau" />
+        <meta name="summary" contents="Summary and stuff" />
+        <meta name="custom_field" contents="http://notmyidea.org" />
+    </head>
+    <body>
+        Multi-line metadata should be supported
+        as well as <strong>inline markup</strong>.
+    </body>
+</html>
--- a/tests/content/article_with_uppercase_metadata.html
+++ b/tests/content/article_with_uppercase_metadata.html
@ -0,0 +1,6 @@
+<html>
+    <head>
+        <title>This is a super article !</title>
+        <meta name="Category" contents="Yeah" />
+    </head>
+</html>
--- a/tests/test_readers.py
+++ b/tests/test_readers.py
@ -260,3 +260,47 @@ class AdReaderTest(unittest.TestCase):
                   '<p>version 1.0.42</p>\n'\
                   '<p>The quick brown fox jumped over the lazy dog&#8217;s back.</p>\n'
        self.assertEqual(content, expected)
+
+class HTMLReaderTest(unittest.TestCase):
+    """Exercise HTMLReader against the HTML fixture files."""
+
+    def test_article_with_comments(self):
+        """Comments inside the body are kept verbatim in the content."""
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_path('article_with_comments.html'))
+
+        self.assertEqual('''
+        Body content
+        <!--  This comment is included (including extra whitespace)   -->
+    ''', content)
+
+    def test_article_with_keywords(self):
+        """A ``keywords`` meta tag is exposed as the ``tags`` metadata."""
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_path('article_with_keywords.html'))
+        expected = {
+            'tags': ['foo', 'bar', 'foobar'],
+        }
+
+        for key, value in expected.items():
+            self.assertEqual(value, metadata[key], key)
+
+    def test_article_with_metadata(self):
+        """Title and meta tags are read and processed into metadata."""
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_path('article_with_metadata.html'))
+        expected = {
+            'category': 'yeah',
+            'author': 'Alexis Métaireau',
+            'title': 'This is a super article !',
+            'summary': 'Summary and stuff',
+            'date': datetime.datetime(2010, 12, 2, 10, 14),
+            'tags': ['foo', 'bar', 'foobar'],
+            'custom_field': 'http://notmyidea.org',
+        }
+
+        for key, value in expected.items():
+            self.assertEqual(value, metadata[key], key)
+
+
+    def test_article_metadata_key_lowercase(self):
+        """Keys of metadata should be lowercase."""
+        reader = readers.HTMLReader({})
+        content, metadata = reader.read(_path('article_with_uppercase_metadata.html'))
+        self.assertIn('category', metadata, "Key should be lowercase.")
+        self.assertEqual('Yeah', metadata.get('category'), "Value keeps cases.")