new HTMLReader

This commit is contained in:
dave mankoff 2012-06-14 23:08:34 -04:00
commit cc1988fbda
5 changed files with 146 additions and 97 deletions

View file

@ -129,24 +129,12 @@ class MarkdownReader(Reader):
metadata[name] = self.process_metadata(name, value[0]) metadata[name] = self.process_metadata(name, value[0])
return content, metadata return content, metadata
""" class HTMLReader(Reader):
class HtmlReader(Reader): """Parses HTML files as input, looking for meta, title, and body tags"""
file_extensions = ['html', 'htm'] file_extensions = ['htm', 'html']
_re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>') enabled = True
def read(self, filename): class _HTMLParser(HTMLParser):
with open(filename) as content:
metadata = {'title': 'unnamed'}
for i in self._re.findall(content):
key = i.split(':')[0][5:].strip()
value = i.split(':')[-1][:-3].strip()
name = key.lower()
metadata[name] = self.process_metadata(name, value)
return content, metadata
"""
class PelicanHTMLParser(HTMLParser):
def __init__(self, settings): def __init__(self, settings):
HTMLParser.__init__(self) HTMLParser.__init__(self)
self.body = '' self.body = ''
@ -214,32 +202,28 @@ class PelicanHTMLParser(HTMLParser):
return result + '>' return result + '>'
def _handle_meta_tag(self, attrs):
    """Record one ``<meta>`` tag in ``self.metadata``.

    ``attrs`` is the list of ``(name, value)`` pairs for the tag. The tag's
    ``name`` attribute, lower-cased, becomes the metadata key, with
    ``keywords`` stored under ``tags``. The value is taken from the
    ``contents`` attribute, falling back to the standard HTML ``content``
    attribute, and finally to an empty string.

    Tags without a usable ``name`` (``<meta charset=...>``,
    ``<meta http-equiv=...>``) carry no named metadata and are ignored
    instead of raising ``AttributeError`` on ``None.lower()``.
    """
    name = self._attr_value(attrs, 'name')
    if name is None:
        return
    name = name.lower()

    # 'contents' is what this reader has always accepted; 'content' is the
    # attribute HTML actually defines, so honour it when 'contents' is absent.
    contents = self._attr_value(attrs, 'contents')
    if contents is None:
        contents = self._attr_value(attrs, 'content')
    if contents is None:
        contents = ''

    if name == 'keywords':
        name = 'tags'
    self.metadata[name] = contents
@classmethod
def _attr_value(cls, attrs, name, default=None):
    """Return the value paired with the first attribute called ``name``.

    ``attrs`` is an iterable of ``(name, value)`` pairs. When no pair has
    the requested name, ``default`` is returned.
    """
    for pair in attrs:
        if pair[0] == name:
            return pair[1]
    return default
class HTMLReader(Reader):
file_extensions = ['htm', 'html']
enabled = True
def read(self, filename):
    """Parse content and metadata of HTML files.

    Opens ``filename``, feeds its text to a ``self._HTMLParser`` built with
    ``self.settings``, then passes every metadata value the parser
    collected through ``self.process_metadata``.

    Returns a ``(body, metadata)`` tuple, where ``body`` is the parser's
    ``body`` attribute and ``metadata`` is a dict of processed values.
    """
    with open(filename) as source:
        # HTMLParser.feed() expects text, not a file object, so read the
        # whole document before handing it over.
        markup = source.read()

    parser = self._HTMLParser(self.settings)
    parser.feed(markup)
    parser.close()

    metadata = {}
    for key, value in parser.metadata.items():
        metadata[key] = self.process_metadata(key, value)
    return parser.body, metadata
_EXTENSIONS = {} _EXTENSIONS = {}

View file

@ -0,0 +1,6 @@
<html>
<head>
<title>This is a super article !</title>
<meta name="keywords" contents="foo, bar, foobar" />
</head>
</html>

View file

@ -0,0 +1,15 @@
<html>
<head>
<title>This is a super article !</title>
<meta name="tags" contents="foo, bar, foobar" />
<meta name="date" contents="2010-12-02 10:14" />
<meta name="category" contents="yeah" />
<meta name="author" contents="Alexis Métaireau" />
<meta name="custom_field" contents="http://notmyidea.org" />
</head>
<body>
Multi-line metadata should be supported
as well as <strong>inline markup</strong>.
<!-- PELICAN_END_SUMMARY -->
</body>
</html>

View file

@ -0,0 +1,6 @@
<html>
<head>
<title>This is a super article !</title>
<meta name="Category" contents="Yeah" />
</head>
</html>

View file

@ -86,3 +86,41 @@ class MdReaderTest(unittest.TestCase):
"<p>This is another markdown test file. Uses the mkd extension.</p>" "<p>This is another markdown test file. Uses the mkd extension.</p>"
self.assertEqual(content, expected) self.assertEqual(content, expected)
class HTMLReaderTest(unittest.TestCase):
    """Exercise ``readers.HTMLReader`` against the HTML fixture files."""

    def test_article_with_metadata(self):
        """Title, summary and every ``<meta>`` tag show up in the metadata."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(_filename('article_with_metadata.html'))
        # NOTE(review): the 'summary' literal is whitespace-sensitive; its
        # lines presumably have to match the fixture's <body> text exactly.
        expected = {
            'category': 'yeah',
            'author': u'Alexis Métaireau',
            'title': 'This is a super article !',
            'summary': u'''
Multi-line metadata should be supported
as well as <strong>inline markup</strong>.
''',
            'date': datetime.datetime(2010, 12, 2, 10, 14),
            'tags': ['foo', 'bar', 'foobar'],
            'custom_field': 'http://notmyidea.org',
        }

        # assertEqual, not the deprecated assertEquals alias, to match the
        # other reader tests in this module.
        for key, value in expected.items():
            self.assertEqual(value, metadata[key], key)

    def test_article_with_keywords(self):
        """A ``keywords`` meta tag is exposed under the ``tags`` key."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(_filename('article_with_keywords.html'))
        expected = {
            'tags': ['foo', 'bar', 'foobar'],
        }

        for key, value in expected.items():
            self.assertEqual(value, metadata[key], key)

    def test_article_metadata_key_lowercase(self):
        """Keys of metadata should be lowercase."""
        reader = readers.HTMLReader({})
        content, metadata = reader.read(_filename('article_with_uppercase_metadata.html'))
        self.assertIn('category', metadata, "Key should be lowercase.")
        self.assertEqual('Yeah', metadata.get('category'), "Value keeps cases.")