mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Merge pull request #2285 from andreacorbellini/2263-handle-invalid-refs
Fix utils.truncate_html_words() to work with invalid HTML references
This commit is contained in:
commit
b4d5e4285e
2 changed files with 70 additions and 10 deletions
|
|
@ -217,6 +217,20 @@ class TestUtils(LoggedTestCase):
|
||||||
utils.truncate_html_words("∫dx " * 100, 20),
|
utils.truncate_html_words("∫dx " * 100, 20),
|
||||||
"∫dx " * 20 + '…')
|
"∫dx " * 20 + '…')
|
||||||
|
|
||||||
|
# Words with invalid or broken HTML references.
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words('&invalid;', 20), '&invalid;')
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words('&#9999999999;', 20), '&#9999999999;')
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words('&#xfffffffff;', 20), '&#xfffffffff;')
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words('&mdash text', 20), '&mdash text')
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words('&#1234 text', 20), '&#1234 text')
|
||||||
|
self.assertEqual(
|
||||||
|
utils.truncate_html_words('&#xabc text', 20), '&#xabc text')
|
||||||
|
|
||||||
def test_process_translations(self):
|
def test_process_translations(self):
|
||||||
fr_articles = []
|
fr_articles = []
|
||||||
en_articles = []
|
en_articles = []
|
||||||
|
|
|
||||||
|
|
@ -550,9 +550,40 @@ class _HTMLWordTruncator(HTMLParser):
|
||||||
if word_end < len(data):
|
if word_end < len(data):
|
||||||
self.add_last_word()
|
self.add_last_word()
|
||||||
|
|
||||||
def handle_ref(self, char):
|
def _handle_ref(self, name, char):
|
||||||
|
"""
|
||||||
|
Called by handle_entityref() or handle_charref() when a ref like
|
||||||
|
`&mdash;`, `&#8212;`, or `&#x2014;` is found.
|
||||||
|
|
||||||
|
The arguments for this method are:
|
||||||
|
|
||||||
|
- `name`: the HTML entity name (such as `mdash` or `#8212` or `#x2014`)
|
||||||
|
- `char`: the Unicode representation of the ref (such as `—`)
|
||||||
|
|
||||||
|
This method checks whether the entity is considered to be part of a
|
||||||
|
word or not and, if not, signals the end of a word.
|
||||||
|
"""
|
||||||
|
# Compute the index of the character right after the ref.
|
||||||
|
#
|
||||||
|
# In a string like 'prefix&mdash;suffix', the end is the sum of:
|
||||||
|
#
|
||||||
|
# - `self.getoffset()` (the length of `prefix`)
|
||||||
|
# - `1` (the length of `&`)
|
||||||
|
# - `len(name)` (the length of `mdash`)
|
||||||
|
# - `1` (the length of `;`)
|
||||||
|
#
|
||||||
|
# Note that, in case of malformed HTML, the ';' character may
|
||||||
|
# not be present.
|
||||||
|
|
||||||
offset = self.getoffset()
|
offset = self.getoffset()
|
||||||
ref_end = self.rawdata.index(';', offset) + 1
|
ref_end = offset + len(name) + 1
|
||||||
|
|
||||||
|
try:
|
||||||
|
if self.rawdata[ref_end] == ';':
|
||||||
|
ref_end += 1
|
||||||
|
except IndexError:
|
||||||
|
# We are at the end of the string and there's no ';'
|
||||||
|
pass
|
||||||
|
|
||||||
if self.last_word_end is None:
|
if self.last_word_end is None:
|
||||||
if self._word_prefix_regex.match(char):
|
if self._word_prefix_regex.match(char):
|
||||||
|
|
@ -564,19 +595,34 @@ class _HTMLWordTruncator(HTMLParser):
|
||||||
self.add_last_word()
|
self.add_last_word()
|
||||||
|
|
||||||
def handle_entityref(self, name):
|
def handle_entityref(self, name):
|
||||||
|
"""
|
||||||
|
Called when an entity ref like '&mdash;' is found
|
||||||
|
|
||||||
|
`name` is the entity ref without ampersand and semicolon (e.g. `mdash`)
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
codepoint = html_entities.name2codepoint[name]
|
codepoint = html_entities.name2codepoint[name]
|
||||||
|
char = six.unichr(codepoint)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
self.handle_ref('')
|
char = ''
|
||||||
else:
|
self._handle_ref(name, char)
|
||||||
self.handle_ref(six.unichr(codepoint))
|
|
||||||
|
|
||||||
def handle_charref(self, name):
|
def handle_charref(self, name):
|
||||||
if name.startswith('x'):
|
"""
|
||||||
codepoint = int(name[1:], 16)
|
Called when a char ref like '&#8212;' or '&#x2014;' is found
|
||||||
else:
|
|
||||||
codepoint = int(name)
|
`name` is the char ref without ampersand and semicolon (e.g. `#8212` or
|
||||||
self.handle_ref(six.unichr(codepoint))
|
`#x2014`)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
if name.startswith('x'):
|
||||||
|
codepoint = int(name[1:], 16)
|
||||||
|
else:
|
||||||
|
codepoint = int(name)
|
||||||
|
char = six.unichr(codepoint)
|
||||||
|
except (ValueError, OverflowError):
|
||||||
|
char = ''
|
||||||
|
self._handle_ref('#' + name, char)
|
||||||
|
|
||||||
|
|
||||||
def truncate_html_words(s, num, end_text='…'):
|
def truncate_html_words(s, num, end_text='…'):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue