From b573576b006152cceb8182911d888e0de424156a Mon Sep 17 00:00:00 2001 From: Andrea Corbellini Date: Thu, 8 Feb 2018 18:30:09 +0100 Subject: [PATCH 1/3] Fix utils.truncate_html_words() to work with invalid HTML references Invalid references like those missing semicolons (e.g. `&mdash`) or those causing overflows (e.g. `&#9999999999;`) are now gracefully handled and no exception is thrown. This commit also adds tests and comments where needed. --- pelican/tests/test_utils.py | 8 +++++ pelican/utils.py | 67 +++++++++++++++++++++++++++++++------ 2 files changed, 65 insertions(+), 10 deletions(-) diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 9a7109d6..634dfbee 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -217,6 +217,14 @@ class TestUtils(LoggedTestCase): utils.truncate_html_words("∫dx " * 100, 20), "∫dx " * 20 + '…') + # Words with invalid or broken HTML references. + self.assertEqual(utils.truncate_html_words('&invalid;', 20), '&invalid;') + self.assertEqual(utils.truncate_html_words('�', 20), '�') + self.assertEqual(utils.truncate_html_words('�', 20), '�') + self.assertEqual(utils.truncate_html_words('&mdash', 20), '&mdash') + self.assertEqual(utils.truncate_html_words('Ӓ', 20), 'Ӓ') + self.assertEqual(utils.truncate_html_words('઼', 20), '઼') + def test_process_translations(self): fr_articles = [] en_articles = [] diff --git a/pelican/utils.py b/pelican/utils.py index ef9da23b..59400699 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -550,9 +550,41 @@ class _HTMLWordTruncator(HTMLParser): if word_end < len(data): self.add_last_word() - def handle_ref(self, char): + def _handle_ref(self, name, char): + """ + Called by handle_entityref() or handle_charref() when a ref like + `&mdash;`, `&#8212;`, or `&#x2014;` is found. 
+ + The arguments for this method are: + + - `name`: the HTML entity name (such as `mdash` or `#8212` or `#x2014`) + - `char`: the Unicode representation of the ref (such as `—`) + + This method checks whether the entity is considered to be part of a + word or not and, if not, signals the end of a word. + """ + # Compute the index of the character right after the ref. + # + # In a string like 'prefix&mdash;suffix', the end is the sum of: + # + # - `self.getoffset()` (the length of `prefix`) + # - `1` (the length of `&`) + # - `len(name)` (the length of `mdash`) + # - `1` (the length of `;`) + # - `1` (required to go to the start of `suffix`) + # + # Note that, in case of malformed HTML, the ';' character may + # not be present. + offset = self.getoffset() - ref_end = self.rawdata.index(';', offset) + 1 + ref_end = offset + len(name) + 1 + 1 + + try: + if self.rawdata[ref_end] == ';': + ref_end += 1 + except IndexError: + # We are at the end of the string and there's no ';' + pass if self.last_word_end is None: if self._word_prefix_regex.match(char): @@ -564,19 +596,34 @@ class _HTMLWordTruncator(HTMLParser): self.add_last_word() def handle_entityref(self, name): + """ + Called when an entity ref like '&mdash;' is found + + `name` is the entity ref without ampersand and semicolon (e.g. `mdash`) + """ try: codepoint = html_entities.name2codepoint[name] + char = six.unichr(codepoint) except KeyError: - self.handle_ref('') - else: - self.handle_ref(six.unichr(codepoint)) + char = '' + self._handle_ref(name, char) def handle_charref(self, name): - if name.startswith('x'): - codepoint = int(name[1:], 16) - else: - codepoint = int(name) - self.handle_ref(six.unichr(codepoint)) + """ + Called when a char ref like '&#8212;' or '&#x2014;' is found + + `name` is the char ref without ampersand and semicolon (e.g. 
`#8212` or + `#x2014`) + """ + try: + if name.startswith('x'): + codepoint = int(name[1:], 16) + else: + codepoint = int(name) + char = six.unichr(codepoint) + except (ValueError, OverflowError): + char = '' + self._handle_ref(name, char) def truncate_html_words(s, num, end_text='…'): From fc7af9e1c3f08093c78f493b4903a63f8e0ca3a9 Mon Sep 17 00:00:00 2001 From: Andrea Corbellini Date: Thu, 8 Feb 2018 18:39:29 +0100 Subject: [PATCH 2/3] flake8 fixes --- pelican/tests/test_utils.py | 18 ++++++++++++------ pelican/utils.py | 8 ++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 634dfbee..3863ba32 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -218,12 +218,18 @@ class TestUtils(LoggedTestCase): "∫dx " * 20 + '…') # Words with invalid or broken HTML references. - self.assertEqual(utils.truncate_html_words('&invalid;', 20), '&invalid;') - self.assertEqual(utils.truncate_html_words('�', 20), '�') - self.assertEqual(utils.truncate_html_words('�', 20), '�') - self.assertEqual(utils.truncate_html_words('&mdash', 20), '&mdash') - self.assertEqual(utils.truncate_html_words('Ӓ', 20), 'Ӓ') - self.assertEqual(utils.truncate_html_words('઼', 20), '઼') + self.assertEqual( + utils.truncate_html_words('&invalid;', 20), '&invalid;') + self.assertEqual( + utils.truncate_html_words('�', 20), '�') + self.assertEqual( + utils.truncate_html_words('�', 20), '�') + self.assertEqual( + utils.truncate_html_words('&mdash', 20), '&mdash') + self.assertEqual( + utils.truncate_html_words('Ӓ', 20), 'Ӓ') + self.assertEqual( + utils.truncate_html_words('઼', 20), '઼') def test_process_translations(self): fr_articles = [] diff --git a/pelican/utils.py b/pelican/utils.py index 59400699..ab2e4a6e 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -552,15 +552,15 @@ class _HTMLWordTruncator(HTMLParser): def _handle_ref(self, name, char): """ - Called by handle_entityref() or 
handle_charref() when a ref like + Called by handle_entityref() or handle_charref() when a ref like `&mdash;`, `&#8212;`, or `&#x2014;` is found. - + The arguments for this method are: - + - `name`: the HTML entity name (such as `mdash` or `#8212` or `#x2014`) - `char`: the Unicode representation of the ref (such as `—`) - This method checks whether the entity is considered to be part of a + This method checks whether the entity is considered to be part of a word or not and, if not, signals the end of a word. """ # Compute the index of the character right after the ref. From 01480a539f0a8b631d3c0a9a41ddfbdf9e854d6e Mon Sep 17 00:00:00 2001 From: Andrea Corbellini Date: Thu, 8 Feb 2018 20:10:08 +0100 Subject: [PATCH 3/3] more accurate code and tests --- pelican/tests/test_utils.py | 6 +++--- pelican/utils.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 3863ba32..b5b8b454 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -225,11 +225,11 @@ class TestUtils(LoggedTestCase): self.assertEqual( utils.truncate_html_words('�', 20), '�') self.assertEqual( - utils.truncate_html_words('&mdash', 20), '&mdash') + utils.truncate_html_words('&mdash text', 20), '&mdash text') self.assertEqual( - utils.truncate_html_words('Ӓ', 20), 'Ӓ') + utils.truncate_html_words('Ӓ text', 20), 'Ӓ text') self.assertEqual( - utils.truncate_html_words('઼', 20), '઼') + utils.truncate_html_words('઼ text', 20), '઼ text') def test_process_translations(self): fr_articles = [] diff --git a/pelican/utils.py b/pelican/utils.py index ab2e4a6e..f9c5eb3f 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -571,13 +571,12 @@ class _HTMLWordTruncator(HTMLParser): # - `1` (the length of `&`) # - `len(name)` (the length of `mdash`) # - `1` (the length of `;`) - # - `1` (required to go to the start of `suffix`) # # Note that, in case of malformed HTML, the ';' character may # not be present. 
offset = self.getoffset() - ref_end = offset + len(name) + 1 + 1 + ref_end = offset + len(name) + 1 try: if self.rawdata[ref_end] == ';': @@ -623,7 +622,7 @@ class _HTMLWordTruncator(HTMLParser): char = six.unichr(codepoint) except (ValueError, OverflowError): char = '' - self._handle_ref(name, char) + self._handle_ref('#' + name, char) def truncate_html_words(s, num, end_text='…'):