When truncating, stop parsing the document immediately after finding the last word.

2025-10-15 20:28:56 +02:00 · 2015-09-04 16:49:41 +02:00 · 2015-09-04 16:49:41 +02:00 · d583efb861
commit d583efb861
parent a6c258eb7f
1 changed file with 18 additions and 5 deletions
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -414,6 +414,13 @@ class _HTMLWordTruncator(HTMLParser):
    _singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area',
                 'hr', 'input')

+    class TruncationCompleted(Exception):
+
+        def __init__(self, truncate_at):
+            super(_HTMLWordTruncator.TruncationCompleted, self).__init__(
+                truncate_at)
+            self.truncate_at = truncate_at
+
    def __init__(self, max_words):
        # In Python 2, HTMLParser is not a new-style class,
        # hence super() cannot be used.
@@ -425,6 +432,16 @@ class _HTMLWordTruncator(HTMLParser):
        self.last_word_end = None
        self.truncate_at = None

+    def feed(self, *args, **kwargs):
+        try:
+            # With Python 2, super() cannot be used.
+            # See the comment for __init__().
+            HTMLParser.feed(self, *args, **kwargs)
+        except self.TruncationCompleted as exc:
+            self.truncate_at = exc.truncate_at
+        else:
+            self.truncate_at = None
+
    def getoffset(self):
        line_start = 0
        lineno, line_offset = self.getpos()
@@ -436,22 +453,18 @@ class _HTMLWordTruncator(HTMLParser):
        self.words_found += 1
        self.last_word_end = None
        if self.words_found == self.max_words:
-            self.truncate_at = word_end
+            raise self.TruncationCompleted(word_end)

    def add_last_word(self):
        if self.last_word_end is not None:
            self.add_word(self.last_word_end)

    def handle_starttag(self, tag, attrs):
-        if self.truncate_at is not None:
-            return
        self.add_last_word()
        if tag not in self._singlets:
            self.open_tags.insert(0, tag)

    def handle_endtag(self, tag):
-        if self.truncate_at is not None:
-            return
        self.add_last_word()
        try:
            i = self.open_tags.index(tag)