Fix quote escaping in read html attributes.

* Wrap HTML attributes in quotes according to their content.  If it contains a double quote use single quotes, otherwise escape with double quotes.
* Add escape_html utility to ensure quote entities are converted identically across Python versions.

Fixes #1260
This commit is contained in:
Simon StJG 2015-10-12 20:31:32 +00:00
commit d333ed12c6
5 changed files with 47 additions and 11 deletions

View file

@ -587,6 +587,17 @@ class HTMLReaderTest(ReaderTest):
<input name="test" disabled style="" />
''', page.content)
def test_article_with_attributes_containing_double_quotes(self):
page = self.read_file(path='article_with_attributes_containing_' +
'double_quotes.html')
self.assertEqual('''
Ensure that if an attribute value contains a double quote, it is
surrounded with single quotes, otherwise with double quotes.
<span data-test="'single quoted string'">Span content</span>
<span data-test='"double quoted string"'>Span content</span>
<span data-test="string without quotes">Span content</span>
''', page.content)
def test_article_metadata_key_lowercase(self):
# Keys of metadata should be lowercase.
page = self.read_file(path='article_with_uppercase_metadata.html')