Fix #2982: Improve _HTMLWordTruncator (#3002)

2022-07-12 02:47:37 +09:00 · 2022-07-12 02:47:37 +09:00 · d5d792060c
commit d5d792060c
parent 5c222ef41b
3 changed files with 11 additions and 4 deletions
--- a/RELEASE.md
+++ b/RELEASE.md
@ -0,0 +1,5 @@
+Release type: minor
+
+* Use JSON values for extra settings in Invoke tasks template (#2994)
+* Add content tag for links, which can help with things like Twitter social cards (#3001)
+* Improve word count behavior when generating summary (#3002)
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@ -230,6 +230,11 @@ class TestUtils(LoggedTestCase):
                'Ты мелькнула, ты предстала, Снова сердце задрожало,', 3
            ),
            'Ты мелькнула, ты' + ' …')
+        self.assertEqual(
+            utils.truncate_html_words(
+                'Trong đầm gì đẹp bằng sen', 4
+            ),
+            'Trong đầm gì đẹp' + ' …')

        # Words enclosed or intervaled by HTML tags.
        self.assertEqual(
--- a/pelican/utils.py
+++ b/pelican/utils.py
@ -412,10 +412,7 @@ def posixize_path(rel_path):

 class _HTMLWordTruncator(HTMLParser):

-    _word_regex = re.compile(r"(({SBC})({SBC}|-|')*)|{DBC}".format(
-        # SBC means Latin-like characters. A word contains a few characters.
-        #         ASCII |Extended Latin | Cyrillic
-        SBC="[0-9a-zA-Z]|[\u00C0-\u024f]|[\u0400-\u04FF]",
+    _word_regex = re.compile(r"{DBC}|(\w[\w'-]*)".format(
        # DBC means CJK-like characters. A character can stand for a word.
        DBC=("([\u4E00-\u9FFF])|"          # CJK Unified Ideographs
             "([\u3400-\u4DBF])|"          # CJK Unified Ideographs Extension A