Merge pull request #1151 from malept/html-reader-non-conforming-meta-tags

Log a warning when the HTML reader encounters a nonconformant meta tag
This commit is contained in:
Justin Mayer 2013-11-09 14:04:38 -08:00
commit c8f5eb54db
4 changed files with 29 additions and 1 deletions

View file

@ -302,7 +302,12 @@ class HTMLReader(BaseReader):
return result + '>'
def _handle_meta_tag(self, attrs):
name = self._attr_value(attrs, 'name').lower()
name = self._attr_value(attrs, 'name')
if name is None:
attr_serialized = ', '.join(['{}="{}"'.format(k, v) for k, v in attrs])
logger.warning("Meta tag in file %s does not have a 'name' attribute, skipping. Attributes: %s", self._filename, attr_serialized)
return
name = name.lower()
contents = self._attr_value(attrs, 'content', '')
if not contents:
contents = self._attr_value(attrs, 'contents', '')

View file

@ -0,0 +1,12 @@
<html>
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>Article with Nonconformant HTML meta tags</title>
<meta name="summary" content="Summary and stuff" />
</head>
<body>
Multi-line metadata should be supported
as well as <strong>inline markup</strong>.
</body>
</html>

View file

@ -86,6 +86,7 @@ class TestArticlesGenerator(unittest.TestCase):
['Test mkd File', 'published', 'test', 'article'],
['This is a super article !', 'published', 'Yeah', 'article'],
['This is a super article !', 'published', 'Yeah', 'article'],
['Article with Nonconformant HTML meta tags', 'published', 'Default', 'article'],
['This is a super article !', 'published', 'yeah', 'article'],
['This is a super article !', 'published', 'yeah', 'article'],
['This is a super article !', 'published', 'yeah', 'article'],

View file

@ -384,3 +384,13 @@ class HTMLReaderTest(ReaderTest):
self.assertIn('category', page.metadata, 'Key should be lowercase.')
self.assertEqual('Yeah', page.metadata.get('category'),
'Value keeps cases.')
def test_article_with_nonconformant_meta_tags(self):
page = self.read_file(path='article_with_nonconformant_meta_tags.html')
expected = {
'summary': 'Summary and stuff',
'title': 'Article with Nonconformant HTML meta tags',
}
for key, value in expected.items():
self.assertEqual(value, page.metadata[key], key)