From d583efb8616cf19401865b04c4270b1a41a96d7f Mon Sep 17 00:00:00 2001 From: Andrea Corbellini Date: Fri, 4 Sep 2015 16:49:41 +0200 Subject: [PATCH] When truncating, stop parsing the document immediately after finding the last word. --- pelican/utils.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/pelican/utils.py b/pelican/utils.py index 7ad0914c..97768f53 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -414,6 +414,13 @@ class _HTMLWordTruncator(HTMLParser): _singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input') + class TruncationCompleted(Exception): + + def __init__(self, truncate_at): + super(_HTMLWordTruncator.TruncationCompleted, self).__init__( + truncate_at) + self.truncate_at = truncate_at + def __init__(self, max_words): # In Python 2, HTMLParser is not a new-style class, # hence super() cannot be used. @@ -425,6 +432,16 @@ class _HTMLWordTruncator(HTMLParser): self.last_word_end = None self.truncate_at = None + def feed(self, *args, **kwargs): + try: + # With Python 2, super() cannot be used. + # See the comment for __init__(). + HTMLParser.feed(self, *args, **kwargs) + except self.TruncationCompleted as exc: + self.truncate_at = exc.truncate_at + else: + self.truncate_at = None + def getoffset(self): line_start = 0 lineno, line_offset = self.getpos() @@ -436,22 +453,18 @@ class _HTMLWordTruncator(HTMLParser): self.words_found += 1 self.last_word_end = None if self.words_found == self.max_words: - self.truncate_at = word_end + raise self.TruncationCompleted(word_end) def add_last_word(self): if self.last_word_end is not None: self.add_word(self.last_word_end) def handle_starttag(self, tag, attrs): - if self.truncate_at is not None: - return self.add_last_word() if tag not in self._singlets: self.open_tags.insert(0, tag) def handle_endtag(self, tag): - if self.truncate_at is not None: - return self.add_last_word() try: i = self.open_tags.index(tag)