Fix quote escaping in read html attributes.

* Wrap HTML attribute values in quotes according to their content: if a value contains a double quote, surround it with single quotes; otherwise surround it with double quotes.
* Add an escape_html utility to ensure quote entities are converted identically across Python versions.

Fixes #1260
2015-10-12 20:31:32 +00:00 · 2015-10-12 20:31:32 +00:00 · d333ed12c6
commit d333ed12c6
parent 661ee49eda
5 changed files with 47 additions and 11 deletions
--- a/pelican/readers.py
+++ b/pelican/readers.py
@ -18,18 +18,14 @@ from pelican import rstdirectives  # NOQA
 from pelican import signals
 from pelican.cache import FileStampDataCacher
 from pelican.contents import Author, Category, Page, Tag
-from pelican.utils import SafeDatetime, get_date, pelican_open, posixize_path
+from pelican.utils import SafeDatetime, escape_html, get_date, pelican_open, \
+    posixize_path

 try:
    from markdown import Markdown
 except ImportError:
    Markdown = False  # NOQA

-try:
-    from html import escape
-except ImportError:
-    from cgi import escape
-
 # Metadata processors have no way to discard an unwanted value, so we have
 # them return this value instead to signal that it should be discarded later.
 # This means that _filter_discardable_metadata() must be called on processed
@ -354,7 +350,7 @@ class HTMLReader(BaseReader):
                self._in_body = False
                self._in_top_level = True
            elif self._in_body:
-                self._data_buffer += '</{}>'.format(escape(tag))
+                self._data_buffer += '</{}>'.format(escape_html(tag))

        def handle_startendtag(self, tag, attrs):
            if tag == 'meta' and self._in_head:
@ -375,11 +371,16 @@ class HTMLReader(BaseReader):
            self._data_buffer += '&#{};'.format(data)

        def build_tag(self, tag, attrs, close_tag):
-            result = '<{}'.format(escape(tag))
+            result = '<{}'.format(escape_html(tag))
            for k, v in attrs:
-                result += ' ' + escape(k)
+                result += ' ' + escape_html(k)
                if v is not None:
-                    result += '="{}"'.format(escape(v))
+                    # If the attribute value contains a double quote, surround
+                    # with single quotes, otherwise use double quotes.
+                    if '"' in v:
+                        result += "='{}'".format(escape_html(v, quote=False))
+                    else:
+                        result += '="{}"'.format(escape_html(v, quote=False))
            if close_tag:
                return result + ' />'
            return result + '>'
--- a/pelican/tests/content/article_with_attributes_containing_double_quotes.html
+++ b/pelican/tests/content/article_with_attributes_containing_double_quotes.html
@ -0,0 +1,11 @@
+<html>
+    <head>
+    </head>
+    <body>
+        Ensure that if an attribute value contains a double quote, it is
+        surrounded with single quotes, otherwise with double quotes.
+        <span data-test="'single quoted string'">Span content</span>
+        <span data-test='"double quoted string"'>Span content</span>
+        <span data-test="string without quotes">Span content</span>
+    </body>
+</html>
--- a/pelican/tests/test_cache.py
+++ b/pelican/tests/test_cache.py
@ -61,7 +61,7 @@ class TestCache(unittest.TestCase):
        - article_with_null_attributes.html
        - 2012-11-30_md_w_filename_meta#foo-bar.md
        """
-        self.assertEqual(generator.readers.read_file.call_count, 3)
+        self.assertEqual(generator.readers.read_file.call_count, 4)

    @unittest.skipUnless(MagicMock, 'Needs Mock module')
    def test_article_reader_content_caching(self):
--- a/pelican/tests/test_readers.py
+++ b/pelican/tests/test_readers.py
@ -587,6 +587,17 @@ class HTMLReaderTest(ReaderTest):
        <input name="test" disabled style="" />
    ''', page.content)

+    def test_article_with_attributes_containing_double_quotes(self):
+        page = self.read_file(path='article_with_attributes_containing_' +
+                                   'double_quotes.html')
+        self.assertEqual('''
+        Ensure that if an attribute value contains a double quote, it is
+        surrounded with single quotes, otherwise with double quotes.
+        <span data-test="'single quoted string'">Span content</span>
+        <span data-test='"double quoted string"'>Span content</span>
+        <span data-test="string without quotes">Span content</span>
+    ''', page.content)
+
    def test_article_metadata_key_lowercase(self):
        # Keys of metadata should be lowercase.
        page = self.read_file(path='article_with_uppercase_metadata.html')
--- a/pelican/utils.py
+++ b/pelican/utils.py
@ -28,6 +28,11 @@ import six
 from six.moves import html_entities
 from six.moves.html_parser import HTMLParser

+try:
+    from html import escape
+except ImportError:
+    from cgi import escape
+
 logger = logging.getLogger(__name__)


@ -548,6 +553,14 @@ def truncate_html_words(s, num, end_text='...'):
    return out


+def escape_html(text, quote=True):
+    """Escape '&', '<' and '>' to HTML-safe sequences.
+
+    In Python 2 this uses cgi.escape and in Python 3 this uses html.escape. We
+    wrap here to ensure the quote argument has an identical default."""
+    return escape(text, quote=quote)
+
+
 def process_translations(content_list, order_by=None):
    """ Finds translation and returns them.