Merge pull request #1802 from andreacorbellini/better-word-recognition

When truncating, consider hyphens, apostrophes and HTML entities.
2025-10-15 20:28:56 +02:00 · 2015-08-28 10:18:44 -07:00 · 2015-08-28 10:18:44 -07:00 · ac151ef867
commit ac151ef867
parent d30ffcf52f 9d0804de7a
10 changed files with 95 additions and 24 deletions
--- a/pelican/tests/output/basic/category/misc.html
+++ b/pelican/tests/output/basic/category/misc.html
@@ -90,7 +90,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                <a class="readmore" href="/unbelievable.html">read more</a>
                </div><!-- /.entry-content -->
            </article></li>
--- a/pelican/tests/output/basic/index.html
+++ b/pelican/tests/output/basic/index.html
@@ -227,7 +227,7 @@ YEAH !</p>
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                <a class="readmore" href="/unbelievable.html">read more</a>
                </div><!-- /.entry-content -->
            </article></li>
--- a/pelican/tests/output/custom/author/alexis-metaireau3.html
+++ b/pelican/tests/output/custom/author/alexis-metaireau3.html
@@ -59,7 +59,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                <a class="readmore" href="../unbelievable.html">read more</a>
 <p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
            </article></li>
--- a/pelican/tests/output/custom/category/misc.html
+++ b/pelican/tests/output/custom/category/misc.html
@@ -103,7 +103,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                <a class="readmore" href="../unbelievable.html">read more</a>
 <p>There are <a href="../unbelievable.html#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
            </article></li>
--- a/pelican/tests/output/custom/index3.html
+++ b/pelican/tests/output/custom/index3.html
@@ -59,7 +59,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                <a class="readmore" href="./unbelievable.html">read more</a>
 <p>There are <a href="./unbelievable.html#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
            </article></li>
--- a/pelican/tests/output/custom_locale/author/alexis-metaireau3.html
+++ b/pelican/tests/output/custom_locale/author/alexis-metaireau3.html
@@ -59,7 +59,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                <a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
 <p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
            </article></li>
@@ -135,4 +135,4 @@ pelican.conf, it ...</p></div>
    }());
 </script>
 </body>
-</html>
+</html>
--- a/pelican/tests/output/custom_locale/category/misc.html
+++ b/pelican/tests/output/custom_locale/category/misc.html
@@ -103,7 +103,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                <a class="readmore" href="../posts/2010/octobre/15/unbelievable/">read more</a>
 <p>There are <a href="../posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
            </article></li>
@@ -175,4 +175,4 @@ pelican.conf, it ...</p></div>
    }());
 </script>
 </body>
-</html>
+</html>
--- a/pelican/tests/output/custom_locale/index3.html
+++ b/pelican/tests/output/custom_locale/index3.html
@@ -59,7 +59,7 @@
 <div class="section" id="testing-another-case">
 <h2>Testing another case</h2>
 <p>This will now have a line number in 'custom' since it's the default in
-pelican.conf, it ...</p></div>
+pelican.conf, it will ...</p></div>
                <a class="readmore" href="./posts/2010/octobre/15/unbelievable/">read more</a>
 <p>There are <a href="./posts/2010/octobre/15/unbelievable/#disqus_thread">comments</a>.</p>                </div><!-- /.entry-content -->
            </article></li>
@@ -135,4 +135,4 @@ pelican.conf, it ...</p></div>
    }());
 </script>
 </body>
-</html>
+</html>
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -146,31 +146,51 @@ class TestUtils(LoggedTestCase):
            self.assertEqual(utils.get_relative_path(value), expected)

    def test_truncate_html_words(self):
+        # Plain text.
        self.assertEqual(
            utils.truncate_html_words('short string', 20),
            'short string')
-
        self.assertEqual(
            utils.truncate_html_words('word ' * 100, 20),
            'word ' * 20 + '...')

+        # Words enclosed in or interspersed with HTML tags.
        self.assertEqual(
            utils.truncate_html_words('<p>' + 'word ' * 100 + '</p>', 20),
            '<p>' + 'word ' * 20 + '...</p>')
-
        self.assertEqual(
            utils.truncate_html_words(
                '<span\nstyle="\n...\n">' + 'word ' * 100 + '</span>', 20),
            '<span\nstyle="\n...\n">' + 'word ' * 20 + '...</span>')
-
        self.assertEqual(
            utils.truncate_html_words('<br>' + 'word ' * 100, 20),
            '<br>' + 'word ' * 20 + '...')
-
        self.assertEqual(
            utils.truncate_html_words('<!-- comment -->' + 'word ' * 100, 20),
            '<!-- comment -->' + 'word ' * 20 + '...')

+        # Words with hyphens and apostrophes.
+        self.assertEqual(
+            utils.truncate_html_words("a-b " * 100, 20),
+            "a-b " * 20 + '...')
+        self.assertEqual(
+            utils.truncate_html_words("it's " * 100, 20),
+            "it's " * 20 + '...')
+
+        # Words with HTML entity references.
+        self.assertEqual(
+            utils.truncate_html_words("&eacute; " * 100, 20),
+            "&eacute; " * 20 + '...')
+        self.assertEqual(
+            utils.truncate_html_words("caf&eacute; " * 100, 20),
+            "caf&eacute; " * 20 + '...')
+        self.assertEqual(
+            utils.truncate_html_words("&egrave;lite " * 100, 20),
+            "&egrave;lite " * 20 + '...')
+        self.assertEqual(
+            utils.truncate_html_words("cafeti&eacute;re " * 100, 20),
+            "cafeti&eacute;re " * 20 + '...')
+
    def test_process_translations(self):
        # create a bunch of articles
        # 1: no translation metadata
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -25,6 +25,7 @@ from jinja2 import Markup
 import pytz

 import six
+from six.moves import html_entities
 from six.moves.html_parser import HTMLParser

 logger = logging.getLogger(__name__)
@@ -408,7 +409,8 @@ def posixize_path(rel_path):

 class _HTMLWordTruncator(HTMLParser):

-    _word_regex = re.compile(r'\w[\w-]*', re.U)
+    _word_regex = re.compile(r"\w[\w'-]*", re.U)
+    _word_prefix_regex = re.compile(r'\w', re.U)
    _singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area',
                 'hr', 'input')

@@ -420,17 +422,37 @@ class _HTMLWordTruncator(HTMLParser):
        self.max_words = max_words
        self.words_found = 0
        self.open_tags = []
+        self.last_word_end = None
        self.truncate_at = None

+    def getoffset(self):
+        line_start = 0
+        lineno, line_offset = self.getpos()
+        for i in range(lineno - 1):
+            line_start = self.rawdata.index('\n', line_start) + 1
+        return line_start + line_offset
+
+    def add_word(self, word_end):
+        self.words_found += 1
+        self.last_word_end = None
+        if self.words_found == self.max_words:
+            self.truncate_at = word_end
+
+    def add_last_word(self):
+        if self.last_word_end is not None:
+            self.add_word(self.last_word_end)
+
    def handle_starttag(self, tag, attrs):
        if self.truncate_at is not None:
            return
+        self.add_last_word()
        if tag not in self._singlets:
            self.open_tags.insert(0, tag)

    def handle_endtag(self, tag):
        if self.truncate_at is not None:
            return
+        self.add_last_word()
        try:
            i = self.open_tags.index(tag)
        except ValueError:
@@ -442,20 +464,49 @@ class _HTMLWordTruncator(HTMLParser):

    def handle_data(self, data):
        word_end = 0
+        offset = self.getoffset()

        while self.words_found < self.max_words:
            match = self._word_regex.search(data, word_end)
            if not match:
                break
-            word_end = match.end(0)
-            self.words_found += 1

-            if self.words_found == self.max_words:
-                line_start = 0
-                lineno, line_offset = self.getpos()
-                for i in range(lineno - 1):
-                    line_start = self.rawdata.index('\n', line_start) + 1
-                self.truncate_at = line_start + line_offset + word_end
+            if match.start(0) > 0:
+                self.add_last_word()
+
+            word_end = match.end(0)
+            self.last_word_end = offset + word_end
+
+        if word_end < len(data):
+            self.add_last_word()
+
+    def handle_ref(self, char):
+        offset = self.getoffset()
+        ref_end = self.rawdata.index(';', offset) + 1
+
+        if self.last_word_end is None:
+            if self._word_prefix_regex.match(char):
+                self.last_word_end = ref_end
+        else:
+            if self._word_regex.match(char):
+                self.last_word_end = ref_end
+            else:
+                self.add_last_word()
+
+    def handle_entityref(self, name):
+        try:
+            codepoint = html_entities.name2codepoint[name]
+        except KeyError:
+            self.handle_ref('')
+        else:
+            self.handle_ref(chr(codepoint))
+
+    def handle_charref(self, name):
+        if name.startswith('x'):
+            codepoint = int(name[1:], 16)
+        else:
+            codepoint = int(name)
+        self.handle_ref(chr(codepoint))


 def truncate_html_words(s, num, end_text='...'):