forked from github/pelican
new HTMLReader
This commit is contained in:
parent
d93530a6ea
commit
cc1988fbda
5 changed files with 146 additions and 97 deletions
|
|
@ -129,117 +129,101 @@ class MarkdownReader(Reader):
|
|||
metadata[name] = self.process_metadata(name, value[0])
|
||||
return content, metadata
|
||||
|
||||
"""
|
||||
class HtmlReader(Reader):
|
||||
file_extensions = ['html', 'htm']
|
||||
_re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>')
|
||||
|
||||
def read(self, filename):
|
||||
with open(filename) as content:
|
||||
metadata = {'title': 'unnamed'}
|
||||
for i in self._re.findall(content):
|
||||
key = i.split(':')[0][5:].strip()
|
||||
value = i.split(':')[-1][:-3].strip()
|
||||
name = key.lower()
|
||||
metadata[name] = self.process_metadata(name, value)
|
||||
|
||||
return content, metadata
|
||||
"""
|
||||
|
||||
class PelicanHTMLParser(HTMLParser):
|
||||
def __init__(self, settings):
|
||||
HTMLParser.__init__(self)
|
||||
self.body = ''
|
||||
self.metadata = {}
|
||||
self.settings = settings
|
||||
|
||||
self._data_buffer = ''
|
||||
|
||||
self._in_top_level = True
|
||||
self._in_head = False
|
||||
self._in_title = False
|
||||
self._in_body = False
|
||||
self._in_tags = False
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'head' and self._in_top_level:
|
||||
self._in_top_level = False
|
||||
self._in_head = True
|
||||
elif tag == 'title' and self._in_head:
|
||||
self._in_title = True
|
||||
self._data_buffer = ''
|
||||
elif tag == 'body' and self._in_top_level:
|
||||
self._in_top_level = False
|
||||
self._in_body = True
|
||||
self._data_buffer = ''
|
||||
elif tag == 'meta' and self._in_head:
|
||||
self._handle_meta_tag(attrs)
|
||||
|
||||
elif self._in_body:
|
||||
self._data_buffer += self.build_tag(tag, attrs, False)
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag == 'head':
|
||||
if self._in_head:
|
||||
self._in_head = False
|
||||
self._in_top_level = True
|
||||
elif tag == 'title':
|
||||
self._in_title = False
|
||||
self.metadata['title'] = self._data_buffer
|
||||
elif tag == 'body':
|
||||
self.body = self._data_buffer
|
||||
self._in_body = False
|
||||
self._in_top_level = True
|
||||
elif self._in_body:
|
||||
self._data_buffer += '</{}>'.format(cgi.escape(tag))
|
||||
|
||||
def handle_startendtag(self, tag, attrs):
|
||||
if tag == 'meta' and self._in_head:
|
||||
self._handle_meta_tag(attrs)
|
||||
if self._in_body:
|
||||
self._data_buffer += self.build_tag(tag, attrs, True)
|
||||
|
||||
def handle_comment(self, data):
|
||||
if self._in_body and data.strip() == 'PELICAN_END_SUMMARY':
|
||||
self.metadata['summary'] = self._data_buffer
|
||||
|
||||
def handle_data(self, data):
|
||||
self._data_buffer += data
|
||||
|
||||
def build_tag(self, tag, attrs, close_tag):
|
||||
result = '<{}'.format(cgi.escape(tag))
|
||||
result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs))
|
||||
if close_tag:
|
||||
return result + ' />'
|
||||
return result + '>'
|
||||
|
||||
def _handle_meta_tag(self, attrs):
|
||||
name = self._attr_value(attrs, 'name')
|
||||
contents = self._attr_value(attrs, 'contents', '')
|
||||
if name == 'keywords':
|
||||
if contents:
|
||||
self.metadata['tags'] = [Tag(unicode(tag), self.settings) for tag in contents.split(',')]
|
||||
elif name == 'date':
|
||||
self.metadata['date'] = get_date(contents)
|
||||
else:
|
||||
self.metadata[name] = contents
|
||||
|
||||
@classmethod
|
||||
def _attr_value(cls, attrs, name, default=None):
|
||||
return next((x[1] for x in attrs if x[0] == name), default)
|
||||
|
||||
class HTMLReader(Reader):
|
||||
"""Parses HTML files as input, looking for meta, title, and body tags"""
|
||||
file_extensions = ['htm', 'html']
|
||||
enabled = True
|
||||
|
||||
class _HTMLParser(HTMLParser):
|
||||
def __init__(self, settings):
|
||||
HTMLParser.__init__(self)
|
||||
self.body = ''
|
||||
self.metadata = {}
|
||||
self.settings = settings
|
||||
|
||||
self._data_buffer = ''
|
||||
|
||||
self._in_top_level = True
|
||||
self._in_head = False
|
||||
self._in_title = False
|
||||
self._in_body = False
|
||||
self._in_tags = False
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'head' and self._in_top_level:
|
||||
self._in_top_level = False
|
||||
self._in_head = True
|
||||
elif tag == 'title' and self._in_head:
|
||||
self._in_title = True
|
||||
self._data_buffer = ''
|
||||
elif tag == 'body' and self._in_top_level:
|
||||
self._in_top_level = False
|
||||
self._in_body = True
|
||||
self._data_buffer = ''
|
||||
elif tag == 'meta' and self._in_head:
|
||||
self._handle_meta_tag(attrs)
|
||||
|
||||
elif self._in_body:
|
||||
self._data_buffer += self.build_tag(tag, attrs, False)
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag == 'head':
|
||||
if self._in_head:
|
||||
self._in_head = False
|
||||
self._in_top_level = True
|
||||
elif tag == 'title':
|
||||
self._in_title = False
|
||||
self.metadata['title'] = self._data_buffer
|
||||
elif tag == 'body':
|
||||
self.body = self._data_buffer
|
||||
self._in_body = False
|
||||
self._in_top_level = True
|
||||
elif self._in_body:
|
||||
self._data_buffer += '</{}>'.format(cgi.escape(tag))
|
||||
|
||||
def handle_startendtag(self, tag, attrs):
|
||||
if tag == 'meta' and self._in_head:
|
||||
self._handle_meta_tag(attrs)
|
||||
if self._in_body:
|
||||
self._data_buffer += self.build_tag(tag, attrs, True)
|
||||
|
||||
def handle_comment(self, data):
|
||||
if self._in_body and data.strip() == 'PELICAN_END_SUMMARY':
|
||||
self.metadata['summary'] = self._data_buffer
|
||||
|
||||
def handle_data(self, data):
|
||||
self._data_buffer += data
|
||||
|
||||
def build_tag(self, tag, attrs, close_tag):
|
||||
result = '<{}'.format(cgi.escape(tag))
|
||||
result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs))
|
||||
if close_tag:
|
||||
return result + ' />'
|
||||
return result + '>'
|
||||
|
||||
def _handle_meta_tag(self, attrs):
|
||||
name = self._attr_value(attrs, 'name').lower()
|
||||
contents = self._attr_value(attrs, 'contents', '')
|
||||
|
||||
if name == 'keywords':
|
||||
name = 'tags'
|
||||
self.metadata[name] = contents
|
||||
|
||||
@classmethod
|
||||
def _attr_value(cls, attrs, name, default=None):
|
||||
return next((x[1] for x in attrs if x[0] == name), default)
|
||||
|
||||
def read(self, filename):
|
||||
"""Parse content and metadata of markdown files"""
|
||||
with open(filename) as content:
|
||||
parser = PelicanHTMLParser(self.settings)
|
||||
parser = self._HTMLParser(self.settings)
|
||||
parser.feed(content)
|
||||
parser.close()
|
||||
return parser.body, parser.metadata
|
||||
|
||||
metadata = {}
|
||||
for k in parser.metadata:
|
||||
metadata[k] = self.process_metadata(k, parser.metadata[k])
|
||||
return parser.body, metadata
|
||||
|
||||
_EXTENSIONS = {}
|
||||
|
||||
|
|
|
|||
6
tests/content/article_with_keywords.html
Normal file
6
tests/content/article_with_keywords.html
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>This is a super article !</title>
|
||||
<meta name="keywords" contents="foo, bar, foobar" />
|
||||
</head>
|
||||
</html>
|
||||
15
tests/content/article_with_metadata.html
Normal file
15
tests/content/article_with_metadata.html
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>This is a super article !</title>
|
||||
<meta name="tags" contents="foo, bar, foobar" />
|
||||
<meta name="date" contents="2010-12-02 10:14" />
|
||||
<meta name="category" contents="yeah" />
|
||||
<meta name="author" contents="Alexis Métaireau" />
|
||||
<meta name="custom_field" contents="http://notmyidea.org" />
|
||||
</head>
|
||||
<body>
|
||||
Multi-line metadata should be supported
|
||||
as well as <strong>inline markup</strong>.
|
||||
<!-- PELICAN_END_SUMMARY -->
|
||||
</body>
|
||||
</html>
|
||||
6
tests/content/article_with_uppercase_metadata.html
Normal file
6
tests/content/article_with_uppercase_metadata.html
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>This is a super article !</title>
|
||||
<meta name="Category" contents="Yeah" />
|
||||
</head>
|
||||
</html>
|
||||
|
|
@ -86,3 +86,41 @@ class MdReaderTest(unittest.TestCase):
|
|||
"<p>This is another markdown test file. Uses the mkd extension.</p>"
|
||||
|
||||
self.assertEqual(content, expected)
|
||||
|
||||
class HTMLReaderTest(unittest.TestCase):
|
||||
|
||||
def test_article_with_metadata(self):
|
||||
reader = readers.HTMLReader({})
|
||||
content, metadata = reader.read(_filename('article_with_metadata.html'))
|
||||
expected = {
|
||||
'category': 'yeah',
|
||||
'author': u'Alexis Métaireau',
|
||||
'title': 'This is a super article !',
|
||||
'summary': u'''
|
||||
Multi-line metadata should be supported
|
||||
as well as <strong>inline markup</strong>.
|
||||
''',
|
||||
'date': datetime.datetime(2010, 12, 2, 10, 14),
|
||||
'tags': ['foo', 'bar', 'foobar'],
|
||||
'custom_field': 'http://notmyidea.org',
|
||||
}
|
||||
|
||||
for key, value in expected.items():
|
||||
self.assertEquals(value, metadata[key], key)
|
||||
|
||||
def test_article_with_keywords(self):
|
||||
reader = readers.HTMLReader({})
|
||||
content, metadata = reader.read(_filename('article_with_keywords.html'))
|
||||
expected = {
|
||||
'tags': ['foo', 'bar', 'foobar'],
|
||||
}
|
||||
|
||||
for key, value in expected.items():
|
||||
self.assertEquals(value, metadata[key], key)
|
||||
|
||||
def test_article_metadata_key_lowercase(self):
|
||||
"""Keys of metadata should be lowercase."""
|
||||
reader = readers.HTMLReader({})
|
||||
content, metadata = reader.read(_filename('article_with_uppercase_metadata.html'))
|
||||
self.assertIn('category', metadata, "Key should be lowercase.")
|
||||
self.assertEquals('Yeah', metadata.get('category'), "Value keeps cases.")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue