Merge pull request #382 from mankyd/htmlparser

New, more thorough HTMLParser
This commit is contained in:
Justin Mayer 2013-02-09 16:48:50 -08:00
commit f3bc2ece86
9 changed files with 227 additions and 22 deletions

View file

@ -25,6 +25,12 @@ except ImportError:
asciidoc = False
import re
import cgi
try:
from html.parser import HTMLParser
except ImportError:
from HTMLParser import HTMLParser
from pelican.contents import Category, Tag, Author
from pelican.utils import get_date, pelican_open
@ -154,30 +160,114 @@ class MarkdownReader(Reader):
def read(self, source_path):
"""Parse content and metadata of markdown files"""
text = pelican_open(source_path)
md = Markdown(extensions=set(self.extensions + ['meta']))
content = md.convert(text)
with pelican_open(source_path) as text:
md = Markdown(extensions=set(self.extensions + ['meta']))
content = md.convert(text)
metadata = self._parse_metadata(md.Meta)
return content, metadata
class HTMLReader(Reader):
"""Parses HTML files as input, looking for meta, title, and body tags"""
file_extensions = ['htm', 'html']
enabled = True
class HtmlReader(Reader):
file_extensions = ['html', 'htm']
_re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>')
class _HTMLParser(HTMLParser):
def __init__(self, settings):
HTMLParser.__init__(self)
self.body = ''
self.metadata = {}
self.settings = settings
def read(self, source_path):
"""Parse content and metadata of (x)HTML files"""
with open(source_path) as content:
metadata = {'title': 'unnamed'}
for i in self._re.findall(content):
key = i.split(':')[0][5:].strip()
value = i.split(':')[-1][:-3].strip()
name = key.lower()
metadata[name] = self.process_metadata(name, value)
self._data_buffer = ''
return content, metadata
self._in_top_level = True
self._in_head = False
self._in_title = False
self._in_body = False
self._in_tags = False
def handle_starttag(self, tag, attrs):
if tag == 'head' and self._in_top_level:
self._in_top_level = False
self._in_head = True
elif tag == 'title' and self._in_head:
self._in_title = True
self._data_buffer = ''
elif tag == 'body' and self._in_top_level:
self._in_top_level = False
self._in_body = True
self._data_buffer = ''
elif tag == 'meta' and self._in_head:
self._handle_meta_tag(attrs)
elif self._in_body:
self._data_buffer += self.build_tag(tag, attrs, False)
def handle_endtag(self, tag):
if tag == 'head':
if self._in_head:
self._in_head = False
self._in_top_level = True
elif tag == 'title':
self._in_title = False
self.metadata['title'] = self._data_buffer
elif tag == 'body':
self.body = self._data_buffer
self._in_body = False
self._in_top_level = True
elif self._in_body:
self._data_buffer += '</{}>'.format(cgi.escape(tag))
def handle_startendtag(self, tag, attrs):
if tag == 'meta' and self._in_head:
self._handle_meta_tag(attrs)
if self._in_body:
self._data_buffer += self.build_tag(tag, attrs, True)
def handle_comment(self, data):
self._data_buffer += '<!--{}-->'.format(data)
def handle_data(self, data):
self._data_buffer += data
def handle_entityref(self, data):
self._data_buffer += '&{};'.format(data)
def handle_charref(self, data):
self._data_buffer += '&#{};'.format(data)
def build_tag(self, tag, attrs, close_tag):
result = '<{}'.format(cgi.escape(tag))
result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs))
if close_tag:
return result + ' />'
return result + '>'
def _handle_meta_tag(self, attrs):
name = self._attr_value(attrs, 'name').lower()
contents = self._attr_value(attrs, 'contents', '')
if name == 'keywords':
name = 'tags'
self.metadata[name] = contents
@classmethod
def _attr_value(cls, attrs, name, default=None):
return next((x[1] for x in attrs if x[0] == name), default)
def read(self, filename):
"""Parse content and metadata of HTML files"""
with pelican_open(filename) as content:
parser = self._HTMLParser(self.settings)
parser.feed(content)
parser.close()
metadata = {}
for k in parser.metadata:
metadata[k] = self.process_metadata(k, parser.metadata[k])
return parser.body, metadata
class AsciiDocReader(Reader):
enabled = bool(asciidoc)