Fix #2982: Improve _HTMLWordTruncator

2025-10-15 20:28:56 +02:00 · 2022-05-02 22:12:23 +09:00 · 2022-05-02 22:12:23 +09:00 · 747fec5b22
commit 747fec5b22
parent e8d6318e93
3 changed files with 9 additions and 4 deletions
--- a/RELEASE.md
+++ b/RELEASE.md
@ -0,0 +1,3 @@
+Release type: minor
+
+Improve word count behavior when generating a summary.
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@ -230,6 +230,11 @@ class TestUtils(LoggedTestCase):
                'Ты мелькнула, ты предстала, Снова сердце задрожало,', 3
            ),
            'Ты мелькнула, ты' + ' …')
+        self.assertEqual(
+            utils.truncate_html_words(
+                'Trong đầm gì đẹp bằng sen', 4
+            ),
+            'Trong đầm gì đẹp' + ' …')

        # Words enclosed or intervaled by HTML tags.
        self.assertEqual(
--- a/pelican/utils.py
+++ b/pelican/utils.py
@ -412,10 +412,7 @@ def posixize_path(rel_path):

 class _HTMLWordTruncator(HTMLParser):

-    _word_regex = re.compile(r"(({SBC})({SBC}|-|')*)|{DBC}".format(
-        # SBC means Latin-like characters. A word contains a few characters.
-        #         ASCII |Extended Latin | Cyrillic
-        SBC="[0-9a-zA-Z]|[\u00C0-\u024f]|[\u0400-\u04FF]",
+    _word_regex = re.compile(r"{DBC}|(\w[\w'-]*)".format(
        # DBC means CJK-like characters. A character can stand for a word.
        DBC=("([\u4E00-\u9FFF])|"          # CJK Unified Ideographs
             "([\u3400-\u4DBF])|"          # CJK Unified Ideographs Extension A