Fix quote escaping in read html attributes.

* Wrap HTML attribute values in quotes chosen according to their content: if the value contains a double quote, use single quotes; otherwise use double quotes.
* Add an escape_html utility to ensure quote entities are converted identically across Python versions.

Fixes #1260
2015-10-12 20:31:32 +00:00 · 2015-10-12 20:31:32 +00:00 · d333ed12c6
commit d333ed12c6
parent 661ee49eda
5 changed files with 47 additions and 11 deletions
--- a/pelican/readers.py
+++ b/pelican/readers.py
@ -18,18 +18,14 @@ from pelican import rstdirectives  # NOQA
 from pelican import signals
 from pelican.cache import FileStampDataCacher
 from pelican.contents import Author, Category, Page, Tag
-from pelican.utils import SafeDatetime, get_date, pelican_open, posixize_path
+from pelican.utils import SafeDatetime, escape_html, get_date, pelican_open, \
    posixize_path
 try:
    from markdown import Markdown
 except ImportError:
    Markdown = False  # NOQA
 try:
    from html import escape
 except ImportError:
    from cgi import escape
 # Metadata processors have no way to discard an unwanted value, so we have
 # them return this value instead to signal that it should be discarded later.
 # This means that _filter_discardable_metadata() must be called on processed
@ -354,7 +350,7 @@ class HTMLReader(BaseReader):
                self._in_body = False
                self._in_top_level = True
            elif self._in_body:
-                self._data_buffer += '</{}>'.format(escape(tag))
+                self._data_buffer += '</{}>'.format(escape_html(tag))
        def handle_startendtag(self, tag, attrs):
            if tag == 'meta' and self._in_head:
@ -375,11 +371,16 @@ class HTMLReader(BaseReader):
            self._data_buffer += '&#{};'.format(data)
        def build_tag(self, tag, attrs, close_tag):
            """Rebuild the source text of an HTML tag from its parsed parts.

            :param tag: tag name; HTML-escaped before being emitted.
            :param attrs: iterable of ``(name, value)`` pairs. A value of
                ``None`` yields a bare attribute with no ``=value`` part.
            :param close_tag: when true the tag is emitted self-closed
                (``<tag ... />``), otherwise as an opening tag
                (``<tag ...>``).
            :return: the serialized tag as a string.
            """
            parts = ['<{}'.format(escape_html(tag))]
            for k, v in attrs:
                parts.append(' ' + escape_html(k))
                if v is None:
                    continue
                # '&', '<' and '>' are always escaped. Quotes are left alone
                # here and handled by choosing a delimiter the value does
                # not contain.
                escaped = escape_html(v, quote=False)
                if '"' not in v:
                    parts.append('="{}"'.format(escaped))
                elif "'" not in v:
                    # The value contains a double quote but no single quote,
                    # so single-quote delimiters are safe.
                    parts.append("='{}'".format(escaped))
                else:
                    # Both quote characters occur, so neither delimiter is
                    # safe on its own: fall back to double quotes and encode
                    # the embedded double quotes as entities.
                    escaped = escaped.replace('"', '&quot;')
                    parts.append('="{}"'.format(escaped))
            parts.append(' />' if close_tag else '>')
            return ''.join(parts)
--- a/pelican/tests/content/article_with_attributes_containing_double_quotes.html
+++ b/pelican/tests/content/article_with_attributes_containing_double_quotes.html
@ -0,0 +1,11 @@
 <html>
    <head>
    </head>
    <body>
        Ensure that if an attribute value contains a double quote, it is
        surrounded with single quotes, otherwise with double quotes.
        <span data-test="'single quoted string'">Span content</span>
        <span data-test='"double quoted string"'>Span content</span>
        <span data-test="string without quotes">Span content</span>
    </body>
 </html>
--- a/pelican/tests/test_cache.py
+++ b/pelican/tests/test_cache.py
@ -61,7 +61,7 @@ class TestCache(unittest.TestCase):
        - article_with_null_attributes.html
        - 2012-11-30_md_w_filename_meta#foo-bar.md
        """
-        self.assertEqual(generator.readers.read_file.call_count, 3)
+        self.assertEqual(generator.readers.read_file.call_count, 4)
    @unittest.skipUnless(MagicMock, 'Needs Mock module')
    def test_article_reader_content_caching(self):
--- a/pelican/tests/test_readers.py
+++ b/pelican/tests/test_readers.py
@ -587,6 +587,17 @@ class HTMLReaderTest(ReaderTest):
        <input name="test" disabled style="" />
    ''', page.content)
    def test_article_with_attributes_containing_double_quotes(self):
        """Attribute values keep their quote characters after a read.

        The expected content shows a value containing double quotes
        wrapped in single quotes, and the other values in double quotes.
        """
        fixture = 'article_with_attributes_containing_double_quotes.html'
        expected = '''
        Ensure that if an attribute value contains a double quote, it is
        surrounded with single quotes, otherwise with double quotes.
        <span data-test="'single quoted string'">Span content</span>
        <span data-test='"double quoted string"'>Span content</span>
        <span data-test="string without quotes">Span content</span>
    '''
        page = self.read_file(path=fixture)
        self.assertEqual(expected, page.content)
    def test_article_metadata_key_lowercase(self):
        # Keys of metadata should be lowercase.
        page = self.read_file(path='article_with_uppercase_metadata.html')
--- a/pelican/utils.py
+++ b/pelican/utils.py
@ -28,6 +28,11 @@ import six
 from six.moves import html_entities
 from six.moves.html_parser import HTMLParser
 try:
    from html import escape
 except ImportError:
    from cgi import escape
 logger = logging.getLogger(__name__)
@ -548,6 +553,14 @@ def truncate_html_words(s, num, end_text='...'):
    return out
 def escape_html(text, quote=True):
    """Return *text* with '&', '<' and '>' replaced by HTML entities.

    Thin wrapper around whichever ``escape`` this module imported
    (``html.escape`` when available, ``cgi.escape`` otherwise). Its purpose
    is to give ``quote`` the same default, True, with either one.

    :param text: the string to escape.
    :param quote: passed straight through to the underlying ``escape``;
        when true, quote characters are escaped as well.
    :return: the escaped string.
    """
    return escape(text, quote)
 def process_translations(content_list, order_by=None):
    """ Finds translation and returns them.