mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Merge pull request #2285 from andreacorbellini/2263-handle-invalid-refs
Fix utils.truncate_html_words() to work with invalid HTML references
This commit is contained in:
commit
b4d5e4285e
2 changed files with 70 additions and 10 deletions
|
|
@ -217,6 +217,20 @@ class TestUtils(LoggedTestCase):
|
|||
utils.truncate_html_words("∫dx " * 100, 20),
|
||||
"∫dx " * 20 + '…')
|
||||
|
||||
# Words with invalid or broken HTML references.
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('&invalid;', 20), '&invalid;')
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('&#9999999999;', 20), '&#9999999999;')
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('&#xfffffffff;', 20), '&#xfffffffff;')
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('&mdash text', 20), '&mdash text')
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('&#1234 text', 20), '&#1234 text')
|
||||
self.assertEqual(
|
||||
utils.truncate_html_words('&#xabc text', 20), '&#xabc text')
|
||||
|
||||
def test_process_translations(self):
|
||||
fr_articles = []
|
||||
en_articles = []
|
||||
|
|
|
|||
|
|
@ -550,9 +550,40 @@ class _HTMLWordTruncator(HTMLParser):
|
|||
if word_end < len(data):
|
||||
self.add_last_word()
|
||||
|
||||
def handle_ref(self, char):
|
||||
def _handle_ref(self, name, char):
|
||||
"""
|
||||
Called by handle_entityref() or handle_charref() when a ref like
|
||||
`&mdash;`, `&#8212;`, or `&#x2014;` is found.
|
||||
|
||||
The arguments for this method are:
|
||||
|
||||
- `name`: the HTML entity name (such as `mdash` or `#8212` or `#x2014`)
|
||||
- `char`: the Unicode representation of the ref (such as `—`)
|
||||
|
||||
This method checks whether the entity is considered to be part of a
|
||||
word or not and, if not, signals the end of a word.
|
||||
"""
|
||||
# Compute the index of the character right after the ref.
|
||||
#
|
||||
# In a string like 'prefix&mdash;suffix', the end is the sum of:
|
||||
#
|
||||
# - `self.getoffset()` (the length of `prefix`)
|
||||
# - `1` (the length of `&`)
|
||||
# - `len(name)` (the length of `mdash`)
|
||||
# - `1` (the length of `;`)
|
||||
#
|
||||
# Note that, in case of malformed HTML, the ';' character may
|
||||
# not be present.
|
||||
|
||||
offset = self.getoffset()
|
||||
ref_end = self.rawdata.index(';', offset) + 1
|
||||
ref_end = offset + len(name) + 1
|
||||
|
||||
try:
|
||||
if self.rawdata[ref_end] == ';':
|
||||
ref_end += 1
|
||||
except IndexError:
|
||||
# We are at the end of the string and there's no ';'
|
||||
pass
|
||||
|
||||
if self.last_word_end is None:
|
||||
if self._word_prefix_regex.match(char):
|
||||
|
|
@ -564,19 +595,34 @@ class _HTMLWordTruncator(HTMLParser):
|
|||
self.add_last_word()
|
||||
|
||||
def handle_entityref(self, name):
|
||||
"""
|
||||
Called when an entity ref like '&mdash;' is found
|
||||
|
||||
`name` is the entity ref without ampersand and semicolon (e.g. `mdash`)
|
||||
"""
|
||||
try:
|
||||
codepoint = html_entities.name2codepoint[name]
|
||||
char = six.unichr(codepoint)
|
||||
except KeyError:
|
||||
self.handle_ref('')
|
||||
else:
|
||||
self.handle_ref(six.unichr(codepoint))
|
||||
char = ''
|
||||
self._handle_ref(name, char)
|
||||
|
||||
def handle_charref(self, name):
|
||||
if name.startswith('x'):
|
||||
codepoint = int(name[1:], 16)
|
||||
else:
|
||||
codepoint = int(name)
|
||||
self.handle_ref(six.unichr(codepoint))
|
||||
"""
|
||||
Called when a char ref like '&#8212;' or '&#x2014;' is found
|
||||
|
||||
`name` is the char ref without ampersand and semicolon (e.g. `#8212` or
|
||||
`#x2014`)
|
||||
"""
|
||||
try:
|
||||
if name.startswith('x'):
|
||||
codepoint = int(name[1:], 16)
|
||||
else:
|
||||
codepoint = int(name)
|
||||
char = six.unichr(codepoint)
|
||||
except (ValueError, OverflowError):
|
||||
char = ''
|
||||
self._handle_ref('#' + name, char)
|
||||
|
||||
|
||||
def truncate_html_words(s, num, end_text='…'):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue