Improve word-count behavior when generating summaries

Improve _HTMLWordTruncator by covering more than one Unicode block in _word_regex, so that the word-count function behaves properly with CJK, Cyrillic, and additional Latin characters when generating a summary.
2025-10-15 20:28:56 +02:00 · 2021-09-29 14:44:47 +08:00 · 2021-09-29 14:44:47 +08:00 · 22192c148a
commit 22192c148a
parent a088f8bb9e
2 changed files with 31 additions and 3 deletions
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -218,6 +218,19 @@ class TestUtils(LoggedTestCase):
            utils.truncate_html_words('word ' * 100, 20),
            'word ' * 20 + '…')

+        # Plain text with Unicode content.
+        self.assertEqual(
+            utils.truncate_html_words(
+                '我愿意这样，朋友——我独自远行，不但没有你，\
+                 并且再没有别的影在黑暗里。', 12
+            ),
+            '我愿意这样，朋友——我独自远行' + ' …')
+        self.assertEqual(
+            utils.truncate_html_words(
+                'Ты мелькнула, ты предстала, Снова сердце задрожало,', 3
+            ),
+            'Ты мелькнула, ты' + ' …')
+
        # Words enclosed or intervaled by HTML tags.
        self.assertEqual(
            utils.truncate_html_words('<p>' + 'word ' * 100 + '</p>', 20),