From 9d0804de7af858880e5ef74f0c1c5d8f5ad6419b Mon Sep 17 00:00:00 2001
From: Andrea Corbellini <corbellini.andrea@gmail.com>
Date: Wed, 19 Aug 2015 16:43:59 +0200
Subject: [PATCH] When truncating, consider hyphens, apostrophes and HTML
 entities.

---
 pelican/tests/output/basic/category/misc.html |  2 +-
 pelican/tests/output/basic/index.html         |  2 +-
 .../custom/author/alexis-metaireau3.html      |  2 +-
 .../tests/output/custom/category/misc.html    |  2 +-
 pelican/tests/output/custom/index3.html       |  2 +-
 .../author/alexis-metaireau3.html             |  4 +-
 .../output/custom_locale/category/misc.html   |  4 +-
 .../tests/output/custom_locale/index3.html    |  4 +-
 pelican/tests/test_utils.py                   | 28 ++++++--
 pelican/utils.py                              | 69 ++++++++++++++++---
 10 files changed, 95 insertions(+), 24 deletions(-)
diff --git a/pelican/tests/output/basic/category/misc.html b/pelican/tests/output/basic/category/misc.html
index 0368793e..f491a464 100644
--- a/pelican/tests/output/basic/category/misc.html
+++ b/pelican/tests/output/basic/category/misc.html
@@ -90,7 +90,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                 <a class="readmore" href="/unbelievable.html">read more</a>
                 </div><!-- /.entry-content -->
             </article></li>
diff --git a/pelican/tests/output/basic/index.html b/pelican/tests/output/basic/index.html
index 3066172d..4c74500d 100644
--- a/pelican/tests/output/basic/index.html
+++ b/pelican/tests/output/basic/index.html
@@ -227,7 +227,7 @@ YEAH !</p>
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                 <a class="readmore" href="/unbelievable.html">read more</a>
                 </div><!-- /.entry-content -->
             </article></li>
diff --git a/pelican/tests/output/custom/author/alexis-metaireau3.html b/pelican/tests/output/custom/author/alexis-metaireau3.html
index 54c768ac..3ca4dd0d 100644
--- a/pelican/tests/output/custom/author/alexis-metaireau3.html
+++ b/pelican/tests/output/custom/author/alexis-metaireau3.html
@@ -59,7 +59,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                 <a class="readmore" href="../unbelievable.html">read more</a>
 <p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
             </article></li>
diff --git a/pelican/tests/output/custom/category/misc.html b/pelican/tests/output/custom/category/misc.html
index fa71085d..b705a552 100644
--- a/pelican/tests/output/custom/category/misc.html
+++ b/pelican/tests/output/custom/category/misc.html
@@ -103,7 +103,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                 <a class="readmore" href="../unbelievable.html">read more</a>
 <p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
             </article></li>
diff --git a/pelican/tests/output/custom/index3.html b/pelican/tests/output/custom/index3.html
index 1dab4e7d..b968b7e8 100644
--- a/pelican/tests/output/custom/index3.html
+++ b/pelican/tests/output/custom/index3.html
@@ -59,7 +59,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                 <a class="readmore" href="./unbelievable.html">read more</a>
 <p>There are <a href="./unbelievable.html#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
             </article></li>
diff --git a/pelican/tests/output/custom_locale/author/alexis-metaireau3.html b/pelican/tests/output/custom_locale/author/alexis-metaireau3.html
index 66575c71..2fea24c3 100644
--- a/pelican/tests/output/custom_locale/author/alexis-metaireau3.html
+++ b/pelican/tests/output/custom_locale/author/alexis-metaireau3.html
@@ -59,7 +59,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                 <a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
 <p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
             </article></li>
@@ -135,4 +135,4 @@ pelican.conf, it ...</p></div>
     }());
 </script>
 </body>
-</html>
\ No newline at end of file
+</html>
diff --git a/pelican/tests/output/custom_locale/category/misc.html b/pelican/tests/output/custom_locale/category/misc.html
index bb78a8cc..f44f725d 100644
--- a/pelican/tests/output/custom_locale/category/misc.html
+++ b/pelican/tests/output/custom_locale/category/misc.html
@@ -103,7 +103,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                 <a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
 <p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
             </article></li>
@@ -175,4 +175,4 @@ pelican.conf, it ...</p></div>
     }());
 </script>
 </body>
-</html>
\ No newline at end of file
+</html>
diff --git a/pelican/tests/output/custom_locale/index3.html b/pelican/tests/output/custom_locale/index3.html
index 49f70ba2..926bc25e 100644
--- a/pelican/tests/output/custom_locale/index3.html
+++ b/pelican/tests/output/custom_locale/index3.html
@@ -59,7 +59,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                 <a class="readmore" href="./posts/2010/octobre/15/unbelievable/">read more</a>
 <p>There are <a href="./posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
             </article></li>
@@ -135,4 +135,4 @@ pelican.conf, it ...</p></div>
     }());
 </script>
 </body>
-</html>
\ No newline at end of file
+</html>
diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py
index d6fdf70e..a076a2c7 100644
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -146,31 +146,51 @@ class TestUtils(LoggedTestCase):
             self.assertEqual(utils.get_relative_path(value), expected)
 
     def test_truncate_html_words(self):
+        # Plain text.
         self.assertEqual(
             utils.truncate_html_words('short string', 20),
             'short string')
-
         self.assertEqual(
             utils.truncate_html_words('word ' * 100, 20),
             'word ' * 20 + '...')
 
+        # Words enclosed in or preceded by HTML tags and comments.
         self.assertEqual(
             utils.truncate_html_words('<p>' + 'word ' * 100 + '</p>', 20),
             '<p>' + 'word ' * 20 + '...</p>')
-
         self.assertEqual(
             utils.truncate_html_words(
                 '<span\nstyle="\n...\n">' + 'word ' * 100 + '</span>', 20),
             '<span\nstyle="\n...\n">' + 'word ' * 20 + '...</span>')
-
         self.assertEqual(
             utils.truncate_html_words('<br>' + 'word ' * 100, 20),
             '<br>' + 'word ' * 20 + '...')
-
         self.assertEqual(
             utils.truncate_html_words('<!-- comment -->' + 'word ' * 100, 20),
             '<!-- comment -->' + 'word ' * 20 + '...')
 
+        # Words with hyphens and apostrophes.
+        self.assertEqual(
+            utils.truncate_html_words("a-b " * 100, 20),
+            "a-b " * 20 + '...')
+        self.assertEqual(
+            utils.truncate_html_words("it's " * 100, 20),
+            "it's " * 20 + '...')
+
+        # Words with HTML entity references.
+        self.assertEqual(
+            utils.truncate_html_words("&eacute; " * 100, 20),
+            "&eacute; " * 20 + '...')
+        self.assertEqual(
+            utils.truncate_html_words("caf&eacute; " * 100, 20),
+            "caf&eacute; " * 20 + '...')
+        self.assertEqual(
+            utils.truncate_html_words("&egrave;lite " * 100, 20),
+            "&egrave;lite " * 20 + '...')
+        self.assertEqual(
+            utils.truncate_html_words("cafeti&eacute;re " * 100, 20),
+            "cafeti&eacute;re " * 20 + '...')
+
     def test_process_translations(self):
         # create a bunch of articles
         # 1: no translation metadata
diff --git a/pelican/utils.py b/pelican/utils.py
index 786a9425..7ad0914c 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -25,6 +25,7 @@ from jinja2 import Markup
 import pytz
 
 import six
+from six.moves import html_entities
 from six.moves.html_parser import HTMLParser
 
 logger = logging.getLogger(__name__)
@@ -408,7 +409,8 @@ def posixize_path(rel_path):
 
 class _HTMLWordTruncator(HTMLParser):
 
-    _word_regex = re.compile(r'\w[\w-]*', re.U)
+    _word_regex = re.compile(r"\w[\w'-]*", re.U)
+    _word_prefix_regex = re.compile(r'\w', re.U)
     _singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area',
                  'hr', 'input')
 
@@ -420,17 +422,37 @@ class _HTMLWordTruncator(HTMLParser):
         self.max_words = max_words
         self.words_found = 0
         self.open_tags = []
+        self.last_word_end = None
         self.truncate_at = None
 
+    def getoffset(self):
+        line_start = 0
+        lineno, line_offset = self.getpos()
+        for i in range(lineno - 1):
+            line_start = self.rawdata.index('\n', line_start) + 1
+        return line_start + line_offset
+
+    def add_word(self, word_end):
+        self.words_found += 1
+        self.last_word_end = None
+        if self.words_found == self.max_words:
+            self.truncate_at = word_end
+
+    def add_last_word(self):
+        if self.last_word_end is not None:
+            self.add_word(self.last_word_end)
+
     def handle_starttag(self, tag, attrs):
         if self.truncate_at is not None:
             return
+        self.add_last_word()
         if tag not in self._singlets:
             self.open_tags.insert(0, tag)
 
     def handle_endtag(self, tag):
         if self.truncate_at is not None:
             return
+        self.add_last_word()
         try:
             i = self.open_tags.index(tag)
         except ValueError:
@@ -442,20 +464,49 @@ class _HTMLWordTruncator(HTMLParser):
 
     def handle_data(self, data):
         word_end = 0
+        offset = self.getoffset()
 
         while self.words_found < self.max_words:
             match = self._word_regex.search(data, word_end)
             if not match:
                 break
-            word_end = match.end(0)
-            self.words_found += 1
 
-            if self.words_found == self.max_words:
-                line_start = 0
-                lineno, line_offset = self.getpos()
-                for i in range(lineno - 1):
-                    line_start = self.rawdata.index('\n', line_start) + 1
-                self.truncate_at = line_start + line_offset + word_end
+            if match.start(0) > 0:
+                self.add_last_word()
+
+            word_end = match.end(0)
+            self.last_word_end = offset + word_end
+
+        if word_end < len(data):
+            self.add_last_word()
+
+    def handle_ref(self, char):
+        offset = self.getoffset()
+        ref_end = self.rawdata.index(';', offset) + 1
+
+        if self.last_word_end is None:
+            if self._word_prefix_regex.match(char):
+                self.last_word_end = ref_end
+        else:
+            if self._word_regex.match(char):
+                self.last_word_end = ref_end
+            else:
+                self.add_last_word()
+
+    def handle_entityref(self, name):
+        try:
+            codepoint = html_entities.name2codepoint[name]
+        except KeyError:
+            self.handle_ref('')
+        else:
+            self.handle_ref(chr(codepoint))
+
+    def handle_charref(self, name):
+        if name.startswith('x'):
+            codepoint = int(name[1:], 16)
+        else:
+            codepoint = int(name)
+        self.handle_ref(chr(codepoint))
 
 
 def truncate_html_words(s, num, end_text='...'):