1
0
Fork 0
forked from github/pelican

Fix quote escaping of HTML attributes in the HTML reader.

* Wrap HTML attribute values in quotes according to their content: if the value contains a double quote, wrap it in single quotes; otherwise wrap it in double quotes.
* Add escape_html utility to ensure quote entities are converted identically across Python versions.

Fixes #1260
This commit is contained in:
Simon StJG 2015-10-12 20:31:32 +00:00
commit d333ed12c6
5 changed files with 47 additions and 11 deletions

View file

@ -18,18 +18,14 @@ from pelican import rstdirectives # NOQA
from pelican import signals from pelican import signals
from pelican.cache import FileStampDataCacher from pelican.cache import FileStampDataCacher
from pelican.contents import Author, Category, Page, Tag from pelican.contents import Author, Category, Page, Tag
from pelican.utils import SafeDatetime, get_date, pelican_open, posixize_path from pelican.utils import SafeDatetime, escape_html, get_date, pelican_open, \
posixize_path
try: try:
from markdown import Markdown from markdown import Markdown
except ImportError: except ImportError:
Markdown = False # NOQA Markdown = False # NOQA
try:
from html import escape
except ImportError:
from cgi import escape
# Metadata processors have no way to discard an unwanted value, so we have # Metadata processors have no way to discard an unwanted value, so we have
# them return this value instead to signal that it should be discarded later. # them return this value instead to signal that it should be discarded later.
# This means that _filter_discardable_metadata() must be called on processed # This means that _filter_discardable_metadata() must be called on processed
@ -354,7 +350,7 @@ class HTMLReader(BaseReader):
self._in_body = False self._in_body = False
self._in_top_level = True self._in_top_level = True
elif self._in_body: elif self._in_body:
self._data_buffer += '</{}>'.format(escape(tag)) self._data_buffer += '</{}>'.format(escape_html(tag))
def handle_startendtag(self, tag, attrs): def handle_startendtag(self, tag, attrs):
if tag == 'meta' and self._in_head: if tag == 'meta' and self._in_head:
@ -375,11 +371,16 @@ class HTMLReader(BaseReader):
self._data_buffer += '&#{};'.format(data) self._data_buffer += '&#{};'.format(data)
def build_tag(self, tag, attrs, close_tag):
    """Serialize a tag and its attributes back into HTML text.

    :param tag: tag name; passed through ``escape_html``.
    :param attrs: iterable of ``(name, value)`` pairs.  A value of ``None``
        produces a bare attribute with no ``=...`` part.
    :param close_tag: when true the tag is emitted self-closing (`` />``),
        otherwise it ends with ``>``.
    :return: the tag as a string.

    Attribute values have ``&``, ``<`` and ``>`` escaped but keep their
    quote characters wherever possible; the surrounding quote style is
    chosen so that the value cannot terminate the attribute early.
    """
    pieces = ['<{}'.format(escape_html(tag))]
    for k, v in attrs:
        pieces.append(' ' + escape_html(k))
        if v is None:
            continue
        # quote=False leaves both quote characters untouched, so the
        # checks below see exactly the quotes that will be emitted.
        value = escape_html(v, quote=False)
        if '"' not in value:
            pieces.append('="{}"'.format(value))
        elif "'" not in value:
            # Only double quotes inside: single-quote the attribute so the
            # value can stay verbatim.
            pieces.append("='{}'".format(value))
        else:
            # Both quote kinds are present, so neither delimiter is safe
            # as-is; use double quotes and encode the embedded ones.
            pieces.append('="{}"'.format(value.replace('"', '&quot;')))
    pieces.append(' />' if close_tag else '>')
    return ''.join(pieces)

View file

@ -0,0 +1,11 @@
<html>
<head>
</head>
<body>
Ensure that if an attribute value contains a double quote, it is
surrounded with single quotes, otherwise with double quotes.
<span data-test="'single quoted string'">Span content</span>
<span data-test='"double quoted string"'>Span content</span>
<span data-test="string without quotes">Span content</span>
</body>
</html>

View file

@ -61,7 +61,7 @@ class TestCache(unittest.TestCase):
- article_with_null_attributes.html - article_with_null_attributes.html
- 2012-11-30_md_w_filename_meta#foo-bar.md - 2012-11-30_md_w_filename_meta#foo-bar.md
""" """
self.assertEqual(generator.readers.read_file.call_count, 3) self.assertEqual(generator.readers.read_file.call_count, 4)
@unittest.skipUnless(MagicMock, 'Needs Mock module') @unittest.skipUnless(MagicMock, 'Needs Mock module')
def test_article_reader_content_caching(self): def test_article_reader_content_caching(self):

View file

@ -587,6 +587,17 @@ class HTMLReaderTest(ReaderTest):
<input name="test" disabled style="" /> <input name="test" disabled style="" />
''', page.content) ''', page.content)
def test_article_with_attributes_containing_double_quotes(self):
# Checks how attribute values are re-quoted in the content read from the
# HTML file: per the expected text below, a value containing double quotes
# comes back wrapped in single quotes, and the other values come back
# wrapped in double quotes.
# NOTE(review): presumably the fixture is the HTML file added alongside
# this test -- confirm the file name matches the path built here.
page = self.read_file(path='article_with_attributes_containing_' +
'double_quotes.html')
self.assertEqual('''
Ensure that if an attribute value contains a double quote, it is
surrounded with single quotes, otherwise with double quotes.
<span data-test="'single quoted string'">Span content</span>
<span data-test='"double quoted string"'>Span content</span>
<span data-test="string without quotes">Span content</span>
''', page.content)
def test_article_metadata_key_lowercase(self): def test_article_metadata_key_lowercase(self):
# Keys of metadata should be lowercase. # Keys of metadata should be lowercase.
page = self.read_file(path='article_with_uppercase_metadata.html') page = self.read_file(path='article_with_uppercase_metadata.html')

View file

@ -28,6 +28,11 @@ import six
from six.moves import html_entities from six.moves import html_entities
from six.moves.html_parser import HTMLParser from six.moves.html_parser import HTMLParser
try:
from html import escape
except ImportError:
from cgi import escape
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -548,6 +553,14 @@ def truncate_html_words(s, num, end_text='...'):
return out return out
def escape_html(text, quote=True):
    """Escape HTML-special characters in *text*.

    ``&``, ``<`` and ``>`` are always replaced by entities.  When *quote* is
    true (the default), ``"`` becomes ``&quot;`` and ``'`` becomes
    ``&#x27;`` as well.

    The quote characters are converted here rather than by the underlying
    ``escape`` (``html.escape`` on Python 3, ``cgi.escape`` on Python 2)
    because the two disagree: only ``html.escape`` converts the single
    quote.  Handling them explicitly keeps the output identical on both.
    """
    # The '&' pass must run first (inside ``escape``) so the entities added
    # below are not escaped a second time.
    escaped = escape(text, quote=False)
    if quote:
        escaped = escaped.replace('"', '&quot;').replace("'", '&#x27;')
    return escaped
def process_translations(content_list, order_by=None): def process_translations(content_list, order_by=None):
""" Finds translation and returns them. """ Finds translation and returns them.