From 9d0804de7af858880e5ef74f0c1c5d8f5ad6419b Mon Sep 17 00:00:00 2001 From: Andrea Corbellini Date: Wed, 19 Aug 2015 16:43:59 +0200 Subject: [PATCH] When truncating, consider hyphens, apostrophes and HTML entities. --- pelican/tests/output/basic/category/misc.html | 2 +- pelican/tests/output/basic/index.html | 2 +- .../custom/author/alexis-metaireau3.html | 2 +- .../tests/output/custom/category/misc.html | 2 +- pelican/tests/output/custom/index3.html | 2 +- .../author/alexis-metaireau3.html | 4 +- .../output/custom_locale/category/misc.html | 4 +- .../tests/output/custom_locale/index3.html | 4 +- pelican/tests/test_utils.py | 28 ++++++-- pelican/utils.py | 69 ++++++++++++++++--- 10 files changed, 95 insertions(+), 24 deletions(-) diff --git a/pelican/tests/output/basic/category/misc.html b/pelican/tests/output/basic/category/misc.html index 0368793e..f491a464 100644 --- a/pelican/tests/output/basic/category/misc.html +++ b/pelican/tests/output/basic/category/misc.html @@ -90,7 +90,7 @@

Testing another case

This will now have a line number in 'custom' since it's the default in -pelican.conf, it ...

+pelican.conf, it will ...

read more diff --git a/pelican/tests/output/basic/index.html b/pelican/tests/output/basic/index.html index 3066172d..4c74500d 100644 --- a/pelican/tests/output/basic/index.html +++ b/pelican/tests/output/basic/index.html @@ -227,7 +227,7 @@ YEAH !

Testing another case

This will now have a line number in 'custom' since it's the default in -pelican.conf, it ...

+pelican.conf, it will ...

read more diff --git a/pelican/tests/output/custom/author/alexis-metaireau3.html b/pelican/tests/output/custom/author/alexis-metaireau3.html index 54c768ac..3ca4dd0d 100644 --- a/pelican/tests/output/custom/author/alexis-metaireau3.html +++ b/pelican/tests/output/custom/author/alexis-metaireau3.html @@ -59,7 +59,7 @@

Testing another case

This will now have a line number in 'custom' since it's the default in -pelican.conf, it ...

+pelican.conf, it will ...

read more

There are comments.

diff --git a/pelican/tests/output/custom/category/misc.html b/pelican/tests/output/custom/category/misc.html index fa71085d..b705a552 100644 --- a/pelican/tests/output/custom/category/misc.html +++ b/pelican/tests/output/custom/category/misc.html @@ -103,7 +103,7 @@

Testing another case

This will now have a line number in 'custom' since it's the default in -pelican.conf, it ...

+pelican.conf, it will ...

read more

There are comments.

diff --git a/pelican/tests/output/custom/index3.html b/pelican/tests/output/custom/index3.html index 1dab4e7d..b968b7e8 100644 --- a/pelican/tests/output/custom/index3.html +++ b/pelican/tests/output/custom/index3.html @@ -59,7 +59,7 @@

Testing another case

This will now have a line number in 'custom' since it's the default in -pelican.conf, it ...

+pelican.conf, it will ...

read more

There are comments.

diff --git a/pelican/tests/output/custom_locale/author/alexis-metaireau3.html b/pelican/tests/output/custom_locale/author/alexis-metaireau3.html index 66575c71..2fea24c3 100644 --- a/pelican/tests/output/custom_locale/author/alexis-metaireau3.html +++ b/pelican/tests/output/custom_locale/author/alexis-metaireau3.html @@ -59,7 +59,7 @@

Testing another case

This will now have a line number in 'custom' since it's the default in -pelican.conf, it ...

+pelican.conf, it will ...

read more

There are comments.

@@ -135,4 +135,4 @@ pelican.conf, it ...

}()); - \ No newline at end of file + diff --git a/pelican/tests/output/custom_locale/category/misc.html b/pelican/tests/output/custom_locale/category/misc.html index bb78a8cc..f44f725d 100644 --- a/pelican/tests/output/custom_locale/category/misc.html +++ b/pelican/tests/output/custom_locale/category/misc.html @@ -103,7 +103,7 @@

Testing another case

This will now have a line number in 'custom' since it's the default in -pelican.conf, it ...

+pelican.conf, it will ...

read more

There are comments.

@@ -175,4 +175,4 @@ pelican.conf, it ...

}()); - \ No newline at end of file + diff --git a/pelican/tests/output/custom_locale/index3.html b/pelican/tests/output/custom_locale/index3.html index 49f70ba2..926bc25e 100644 --- a/pelican/tests/output/custom_locale/index3.html +++ b/pelican/tests/output/custom_locale/index3.html @@ -59,7 +59,7 @@

Testing another case

This will now have a line number in 'custom' since it's the default in -pelican.conf, it ...

+pelican.conf, it will ...

read more

There are comments.

@@ -135,4 +135,4 @@ pelican.conf, it ...

}()); - \ No newline at end of file + diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index d6fdf70e..a076a2c7 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -146,31 +146,51 @@ class TestUtils(LoggedTestCase): self.assertEqual(utils.get_relative_path(value), expected) def test_truncate_html_words(self): + # Plain text. self.assertEqual( utils.truncate_html_words('short string', 20), 'short string') - self.assertEqual( utils.truncate_html_words('word ' * 100, 20), 'word ' * 20 + '...') + # Words enclosed or intervaled by HTML tags. self.assertEqual( utils.truncate_html_words('

' + 'word ' * 100 + '

', 20), '

' + 'word ' * 20 + '...

') - self.assertEqual( utils.truncate_html_words( '' + 'word ' * 100 + '', 20), '' + 'word ' * 20 + '...') - self.assertEqual( utils.truncate_html_words('
' + 'word ' * 100, 20), '
' + 'word ' * 20 + '...') - self.assertEqual( utils.truncate_html_words('' + 'word ' * 100, 20), '' + 'word ' * 20 + '...') + # Words with hypens and apostrophes. + self.assertEqual( + utils.truncate_html_words("a-b " * 100, 20), + "a-b " * 20 + '...') + self.assertEqual( + utils.truncate_html_words("it's " * 100, 20), + "it's " * 20 + '...') + + # Words with HTML entity references. + self.assertEqual( + utils.truncate_html_words("é " * 100, 20), + "é " * 20 + '...') + self.assertEqual( + utils.truncate_html_words("café " * 100, 20), + "café " * 20 + '...') + self.assertEqual( + utils.truncate_html_words("èlite " * 100, 20), + "èlite " * 20 + '...') + self.assertEqual( + utils.truncate_html_words("cafetiére " * 100, 20), + "cafetiére " * 20 + '...') + def test_process_translations(self): # create a bunch of articles # 1: no translation metadata diff --git a/pelican/utils.py b/pelican/utils.py index 786a9425..7ad0914c 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -25,6 +25,7 @@ from jinja2 import Markup import pytz import six +from six.moves import html_entities from six.moves.html_parser import HTMLParser logger = logging.getLogger(__name__) @@ -408,7 +409,8 @@ def posixize_path(rel_path): class _HTMLWordTruncator(HTMLParser): - _word_regex = re.compile(r'\w[\w-]*', re.U) + _word_regex = re.compile(r"\w[\w'-]*", re.U) + _word_prefix_regex = re.compile(r'\w', re.U) _singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input') @@ -420,17 +422,37 @@ class _HTMLWordTruncator(HTMLParser): self.max_words = max_words self.words_found = 0 self.open_tags = [] + self.last_word_end = None self.truncate_at = None + def getoffset(self): + line_start = 0 + lineno, line_offset = self.getpos() + for i in range(lineno - 1): + line_start = self.rawdata.index('\n', line_start) + 1 + return line_start + line_offset + + def add_word(self, word_end): + self.words_found += 1 + self.last_word_end = None + if self.words_found == self.max_words: + 
self.truncate_at = word_end + + def add_last_word(self): + if self.last_word_end is not None: + self.add_word(self.last_word_end) + def handle_starttag(self, tag, attrs): if self.truncate_at is not None: return + self.add_last_word() if tag not in self._singlets: self.open_tags.insert(0, tag) def handle_endtag(self, tag): if self.truncate_at is not None: return + self.add_last_word() try: i = self.open_tags.index(tag) except ValueError: @@ -442,20 +464,49 @@ class _HTMLWordTruncator(HTMLParser): def handle_data(self, data): word_end = 0 + offset = self.getoffset() while self.words_found < self.max_words: match = self._word_regex.search(data, word_end) if not match: break - word_end = match.end(0) - self.words_found += 1 - if self.words_found == self.max_words: - line_start = 0 - lineno, line_offset = self.getpos() - for i in range(lineno - 1): - line_start = self.rawdata.index('\n', line_start) + 1 - self.truncate_at = line_start + line_offset + word_end + if match.start(0) > 0: + self.add_last_word() + + word_end = match.end(0) + self.last_word_end = offset + word_end + + if word_end < len(data): + self.add_last_word() + + def handle_ref(self, char): + offset = self.getoffset() + ref_end = self.rawdata.index(';', offset) + 1 + + if self.last_word_end is None: + if self._word_prefix_regex.match(char): + self.last_word_end = ref_end + else: + if self._word_regex.match(char): + self.last_word_end = ref_end + else: + self.add_last_word() + + def handle_entityref(self, name): + try: + codepoint = html_entities.name2codepoint[name] + except KeyError: + self.handle_ref('') + else: + self.handle_ref(chr(codepoint)) + + def handle_charref(self, name): + if name.startswith('x'): + codepoint = int(name[1:], 16) + else: + codepoint = int(name) + self.handle_ref(chr(codepoint)) def truncate_html_words(s, num, end_text='...'):