Merge pull request #1813 from andreacorbellini/faster-truncation

When truncating, stop parsing the document immediately after finding the last word
2025-10-15 20:28:56 +02:00 · 2015-09-23 08:58:55 -07:00 · 2015-09-23 08:58:55 -07:00 · b152aba6c6
commit b152aba6c6
parent 178902a692 d583efb861
1 changed files with 18 additions and 5 deletions
--- a/pelican/utils.py
+++ b/pelican/utils.py
@ -414,6 +414,13 @@ class _HTMLWordTruncator(HTMLParser):
    # Tag names that handle_starttag() does not push onto open_tags
    # (HTML elements that are written without a matching closing tag).
    _singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area',
                 'hr', 'input')
    class TruncationCompleted(Exception):
        def __init__(self, truncate_at):
            super(_HTMLWordTruncator.TruncationCompleted, self).__init__(
                truncate_at)
            self.truncate_at = truncate_at
    def __init__(self, max_words):
        # In Python 2, HTMLParser is not a new-style class,
        # hence super() cannot be used.
@ -425,6 +432,16 @@ class _HTMLWordTruncator(HTMLParser):
        self.last_word_end = None
        self.truncate_at = None
    def feed(self, *args, **kwargs):
        try:
            # With Python 2, super() cannot be used.
            # See the comment for __init__().
            HTMLParser.feed(self, *args, **kwargs)
        except self.TruncationCompleted as exc:
            self.truncate_at = exc.truncate_at
        else:
            self.truncate_at = None
    def getoffset(self):
        line_start = 0
        lineno, line_offset = self.getpos()
@ -436,22 +453,18 @@ class _HTMLWordTruncator(HTMLParser):
        self.words_found += 1
        self.last_word_end = None
        if self.words_found == self.max_words:
-            self.truncate_at = word_end
+            raise self.TruncationCompleted(word_end)
    def add_last_word(self):
        if self.last_word_end is not None:
            self.add_word(self.last_word_end)
    def handle_starttag(self, tag, attrs):
        if self.truncate_at is not None:
            return
        self.add_last_word()
        if tag not in self._singlets:
            self.open_tags.insert(0, tag)
    def handle_endtag(self, tag):
        if self.truncate_at is not None:
            return
        self.add_last_word()
        try:
            i = self.open_tags.index(tag)