1
0
Fork 0
forked from github/pelican

Fix quote escaping in read html attributes.

* Wrap HTML attribute values in quotes chosen according to their content: if a value contains a double quote, wrap it in single quotes; otherwise wrap it in double quotes.
* Add escape_html utility to ensure quote entities are converted identically across Python versions.

Fixes #1260
This commit is contained in:
Simon StJG 2015-10-12 20:31:32 +00:00
commit d333ed12c6
5 changed files with 47 additions and 11 deletions

View file

@ -18,18 +18,14 @@ from pelican import rstdirectives # NOQA
from pelican import signals
from pelican.cache import FileStampDataCacher
from pelican.contents import Author, Category, Page, Tag
from pelican.utils import SafeDatetime, get_date, pelican_open, posixize_path
from pelican.utils import SafeDatetime, escape_html, get_date, pelican_open, \
posixize_path
try:
from markdown import Markdown
except ImportError:
Markdown = False # NOQA
try:
from html import escape
except ImportError:
from cgi import escape
# Metadata processors have no way to discard an unwanted value, so we have
# them return this value instead to signal that it should be discarded later.
# This means that _filter_discardable_metadata() must be called on processed
@ -354,7 +350,7 @@ class HTMLReader(BaseReader):
self._in_body = False
self._in_top_level = True
elif self._in_body:
self._data_buffer += '</{}>'.format(escape(tag))
self._data_buffer += '</{}>'.format(escape_html(tag))
def handle_startendtag(self, tag, attrs):
if tag == 'meta' and self._in_head:
@ -375,11 +371,16 @@ class HTMLReader(BaseReader):
self._data_buffer += '&#{};'.format(data)
def build_tag(self, tag, attrs, close_tag):
    """Serialize a tag name and its attributes back into HTML markup.

    ``tag`` is the tag name and ``attrs`` an iterable of ``(name, value)``
    pairs; a value of ``None`` is emitted as a bare attribute with no
    ``=...`` part.  When ``close_tag`` is true the result ends with
    `` />``, otherwise with ``>``.  Returns the markup as a string.
    """
    result = '<{}'.format(escape_html(tag))
    for k, v in attrs:
        result += ' ' + escape_html(k)
        if v is None:
            # Bare attribute (e.g. ``disabled``): name only.
            continue
        # Quotes are handled below, so only '&', '<' and '>' are
        # escaped here.
        value = escape_html(v, quote=False)
        if '"' in v and "'" not in v:
            # Only double quotes inside: single-quote delimiters keep the
            # value readable and need no further escaping.
            result += "='{}'".format(value)
        else:
            # No double quote, or both kinds of quote.  Delimit with
            # double quotes and encode any embedded double quote, since a
            # single-quoted value containing "'" would end prematurely.
            result += '="{}"'.format(value.replace('"', '&quot;'))
    if close_tag:
        return result + ' />'
    return result + '>'

View file

@ -0,0 +1,11 @@
<html>
<head>
</head>
<body>
Ensure that if an attribute value contains a double quote, it is
surrounded with single quotes, otherwise with double quotes.
<span data-test="'single quoted string'">Span content</span>
<span data-test='"double quoted string"'>Span content</span>
<span data-test="string without quotes">Span content</span>
</body>
</html>

View file

@ -61,7 +61,7 @@ class TestCache(unittest.TestCase):
- article_with_null_attributes.html
- 2012-11-30_md_w_filename_meta#foo-bar.md
"""
self.assertEqual(generator.readers.read_file.call_count, 3)
self.assertEqual(generator.readers.read_file.call_count, 4)
@unittest.skipUnless(MagicMock, 'Needs Mock module')
def test_article_reader_content_caching(self):

View file

@ -587,6 +587,17 @@ class HTMLReaderTest(ReaderTest):
<input name="test" disabled style="" />
''', page.content)
def test_article_with_attributes_containing_double_quotes(self):
# Reads the fixture article_with_attributes_containing_double_quotes.html
# and checks that page.content matches the expected markup exactly.
# The three spans cover: a value containing single quotes (expected in
# double quotes), a value containing double quotes (expected in single
# quotes), and a value with no quotes at all (expected in double quotes).
# NOTE(review): the expected literal is whitespace-sensitive; confirm its
# leading indentation against the real test file before editing it.
page = self.read_file(path='article_with_attributes_containing_' +
'double_quotes.html')
self.assertEqual('''
Ensure that if an attribute value contains a double quote, it is
surrounded with single quotes, otherwise with double quotes.
<span data-test="'single quoted string'">Span content</span>
<span data-test='"double quoted string"'>Span content</span>
<span data-test="string without quotes">Span content</span>
''', page.content)
def test_article_metadata_key_lowercase(self):
# Keys of metadata should be lowercase.
page = self.read_file(path='article_with_uppercase_metadata.html')

View file

@ -28,6 +28,11 @@ import six
from six.moves import html_entities
from six.moves.html_parser import HTMLParser
try:
from html import escape
except ImportError:
from cgi import escape
logger = logging.getLogger(__name__)
@ -548,6 +553,14 @@ def truncate_html_words(s, num, end_text='...'):
return out
def escape_html(text, quote=True):
    """Return ``text`` with '&', '<' and '>' replaced by HTML entities.

    Delegates to the ``escape`` function imported at module level
    (``html.escape`` when available, ``cgi.escape`` otherwise).  Those two
    disagree on the default of their ``quote`` argument, so this wrapper
    pins it to ``True``: quote characters are escaped as well unless the
    caller passes ``quote=False``.
    """
    # Both backends accept ``quote`` as their second positional parameter.
    return escape(text, quote)
def process_translations(content_list, order_by=None):
""" Finds translation and returns them.