From b573576b006152cceb8182911d888e0de424156a Mon Sep 17 00:00:00 2001
From: Andrea Corbellini <corbellini.andrea@gmail.com>
Date: Thu, 8 Feb 2018 18:30:09 +0100
Subject: [PATCH 1/3] Fix utils.truncate_html_words() to work with invalid HTML
 references

Invalid references, such as those missing the trailing semicolon (e.g.
`&mdash`) or those whose code point overflows (e.g. `&#9999999999;`),
are now handled gracefully and no exception is raised.

This commit also adds tests and comments where needed.
---
 pelican/tests/test_utils.py |  8 +++++
 pelican/utils.py            | 67 +++++++++++++++++++++++++++++++------
 2 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py
index 9a7109d6..634dfbee 100644
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -217,6 +217,14 @@ class TestUtils(LoggedTestCase):
             utils.truncate_html_words("&#x222b;dx " * 100, 20),
             "&#x222b;dx " * 20 + '…')
 
+        # Words with invalid or broken HTML references.
+        self.assertEqual(utils.truncate_html_words('&invalid;', 20), '&invalid;')
+        self.assertEqual(utils.truncate_html_words('&#9999999999;', 20), '&#9999999999;')
+        self.assertEqual(utils.truncate_html_words('&#xfffffffff;', 20), '&#xfffffffff;')
+        self.assertEqual(utils.truncate_html_words('&mdash', 20), '&mdash')
+        self.assertEqual(utils.truncate_html_words('&#1234', 20), '&#1234')
+        self.assertEqual(utils.truncate_html_words('&#xabc', 20), '&#xabc')
+
     def test_process_translations(self):
         fr_articles = []
         en_articles = []
diff --git a/pelican/utils.py b/pelican/utils.py
index ef9da23b..59400699 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -550,9 +550,41 @@ class _HTMLWordTruncator(HTMLParser):
         if word_end < len(data):
             self.add_last_word()
 
-    def handle_ref(self, char):
+    def _handle_ref(self, name, char):
+        """
+        Called by handle_entityref() or handle_charref() when a ref like 
+        `&mdash;`, `&#8212;`, or `&#x2014` is found.
+        
+        The arguments for this method are:
+        
+        - `name`: the HTML entity name (such as `mdash` or `#8212` or `#x2014`)
+        - `char`: the Unicode representation of the ref (such as `—`)
+
+        This method checks whether the entity is considered to be part of a 
+        word or not and, if not, signals the end of a word.
+        """
+        # Compute the index of the character right after the ref.
+        #
+        # In a string like 'prefix&mdash;suffix', the end is the sum of:
+        #
+        # - `self.getoffset()` (the length of `prefix`)
+        # - `1` (the length of `&`)
+        # - `len(name)` (the length of `mdash`)
+        # - `1` (the length of `;`)
+        # - `1` (required to go to the start of `suffix`)
+        #
+        # Note that, in case of malformed HTML, the ';' character may
+        # not be present.
+
         offset = self.getoffset()
-        ref_end = self.rawdata.index(';', offset) + 1
+        ref_end = offset + len(name) + 1 + 1
+
+        try:
+            if self.rawdata[ref_end] == ';':
+                ref_end += 1
+        except IndexError:
+            # We are at the end of the string and there's no ';'
+            pass
 
         if self.last_word_end is None:
             if self._word_prefix_regex.match(char):
@@ -564,19 +596,34 @@ class _HTMLWordTruncator(HTMLParser):
                 self.add_last_word()
 
     def handle_entityref(self, name):
+        """
+        Called when an entity ref like '&mdash;' is found
+
+        `name` is the entity ref without ampersand and semicolon (e.g. `mdash`)
+        """
         try:
             codepoint = html_entities.name2codepoint[name]
+            char = six.unichr(codepoint)
         except KeyError:
-            self.handle_ref('')
-        else:
-            self.handle_ref(six.unichr(codepoint))
+            char = ''
+        self._handle_ref(name, char)
 
     def handle_charref(self, name):
-        if name.startswith('x'):
-            codepoint = int(name[1:], 16)
-        else:
-            codepoint = int(name)
-        self.handle_ref(six.unichr(codepoint))
+        """
+        Called when a char ref like '&#8212;' or '&#x2014;' is found
+
+        `name` is the char ref without the leading '&#' and the trailing ';'
+        (e.g. `8212` or `x2014`)
+        """
+        try:
+            if name.startswith('x'):
+                codepoint = int(name[1:], 16)
+            else:
+                codepoint = int(name)
+            char = six.unichr(codepoint)
+        except (ValueError, OverflowError):
+            char = ''
+        self._handle_ref(name, char)
 
 
 def truncate_html_words(s, num, end_text='…'):

From fc7af9e1c3f08093c78f493b4903a63f8e0ca3a9 Mon Sep 17 00:00:00 2001
From: Andrea Corbellini <corbellini.andrea@gmail.com>
Date: Thu, 8 Feb 2018 18:39:29 +0100
Subject: [PATCH 2/3] flake8 fixes

---
 pelican/tests/test_utils.py | 18 ++++++++++++------
 pelican/utils.py            |  8 ++++----
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py
index 634dfbee..3863ba32 100644
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -218,12 +218,18 @@ class TestUtils(LoggedTestCase):
             "&#x222b;dx " * 20 + '…')
 
         # Words with invalid or broken HTML references.
-        self.assertEqual(utils.truncate_html_words('&invalid;', 20), '&invalid;')
-        self.assertEqual(utils.truncate_html_words('&#9999999999;', 20), '&#9999999999;')
-        self.assertEqual(utils.truncate_html_words('&#xfffffffff;', 20), '&#xfffffffff;')
-        self.assertEqual(utils.truncate_html_words('&mdash', 20), '&mdash')
-        self.assertEqual(utils.truncate_html_words('&#1234', 20), '&#1234')
-        self.assertEqual(utils.truncate_html_words('&#xabc', 20), '&#xabc')
+        self.assertEqual(
+            utils.truncate_html_words('&invalid;', 20), '&invalid;')
+        self.assertEqual(
+            utils.truncate_html_words('&#9999999999;', 20), '&#9999999999;')
+        self.assertEqual(
+            utils.truncate_html_words('&#xfffffffff;', 20), '&#xfffffffff;')
+        self.assertEqual(
+            utils.truncate_html_words('&mdash', 20), '&mdash')
+        self.assertEqual(
+            utils.truncate_html_words('&#1234', 20), '&#1234')
+        self.assertEqual(
+            utils.truncate_html_words('&#xabc', 20), '&#xabc')
 
     def test_process_translations(self):
         fr_articles = []
diff --git a/pelican/utils.py b/pelican/utils.py
index 59400699..ab2e4a6e 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -552,15 +552,15 @@ class _HTMLWordTruncator(HTMLParser):
 
     def _handle_ref(self, name, char):
         """
-        Called by handle_entityref() or handle_charref() when a ref like 
+        Called by handle_entityref() or handle_charref() when a ref like
         `&mdash;`, `&#8212;`, or `&#x2014` is found.
-        
+
         The arguments for this method are:
-        
+
         - `name`: the HTML entity name (such as `mdash` or `#8212` or `#x2014`)
         - `char`: the Unicode representation of the ref (such as `—`)
 
-        This method checks whether the entity is considered to be part of a 
+        This method checks whether the entity is considered to be part of a
         word or not and, if not, signals the end of a word.
         """
         # Compute the index of the character right after the ref.

From 01480a539f0a8b631d3c0a9a41ddfbdf9e854d6e Mon Sep 17 00:00:00 2001
From: Andrea Corbellini <corbellini.andrea@gmail.com>
Date: Thu, 8 Feb 2018 20:10:08 +0100
Subject: [PATCH 3/3] Fix ref end offset and test refs followed by text

---
 pelican/tests/test_utils.py | 6 +++---
 pelican/utils.py            | 5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py
index 3863ba32..b5b8b454 100644
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -225,11 +225,11 @@ class TestUtils(LoggedTestCase):
         self.assertEqual(
             utils.truncate_html_words('&#xfffffffff;', 20), '&#xfffffffff;')
         self.assertEqual(
-            utils.truncate_html_words('&mdash', 20), '&mdash')
+            utils.truncate_html_words('&mdash text', 20), '&mdash text')
         self.assertEqual(
-            utils.truncate_html_words('&#1234', 20), '&#1234')
+            utils.truncate_html_words('&#1234 text', 20), '&#1234 text')
         self.assertEqual(
-            utils.truncate_html_words('&#xabc', 20), '&#xabc')
+            utils.truncate_html_words('&#xabc text', 20), '&#xabc text')
 
     def test_process_translations(self):
         fr_articles = []
diff --git a/pelican/utils.py b/pelican/utils.py
index ab2e4a6e..f9c5eb3f 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -571,13 +571,12 @@ class _HTMLWordTruncator(HTMLParser):
         # - `1` (the length of `&`)
         # - `len(name)` (the length of `mdash`)
         # - `1` (the length of `;`)
-        # - `1` (required to go to the start of `suffix`)
         #
         # Note that, in case of malformed HTML, the ';' character may
         # not be present.
 
         offset = self.getoffset()
-        ref_end = offset + len(name) + 1 + 1
+        ref_end = offset + len(name) + 1
 
         try:
             if self.rawdata[ref_end] == ';':
@@ -623,7 +622,7 @@ class _HTMLWordTruncator(HTMLParser):
             char = six.unichr(codepoint)
         except (ValueError, OverflowError):
             char = ''
-        self._handle_ref(name, char)
+        self._handle_ref('#' + name, char)
 
 
 def truncate_html_words(s, num, end_text='…'):