From d5d792060cd5e4c3ff6da82359589b990fa83f27 Mon Sep 17 00:00:00 2001
From: manhhomienbienthuy <1755796+manhhomienbienthuy@users.noreply.github.com>
Date: Tue, 12 Jul 2022 02:47:37 +0900
Subject: [PATCH] Fix #2982: Improve _HTMLWordTruncator (#3002)

---
 RELEASE.md                  | 5 +++++
 pelican/tests/test_utils.py | 5 +++++
 pelican/utils.py            | 5 +----
 3 files changed, 11 insertions(+), 4 deletions(-)
 create mode 100644 RELEASE.md

diff --git a/RELEASE.md b/RELEASE.md
new file mode 100644
index 00000000..36f865f8
--- /dev/null
+++ b/RELEASE.md
@@ -0,0 +1,5 @@
+Release type: minor
+
+* Use JSON values for extra settings in Invoke tasks template (#2994)
+* Add content tag for links, which can help with things like Twitter social cards (#3001)
+* Improve word count behavior when generating summary (#3002)
diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py
index 710e14ed..ee5146df 100644
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -230,6 +230,11 @@ class TestUtils(LoggedTestCase):
                 'Ты мелькнула, ты предстала, Снова сердце задрожало,', 3
             ),
             'Ты мелькнула, ты' + ' …')
+        self.assertEqual(
+            utils.truncate_html_words(
+                'Trong đầm gì đẹp bằng sen', 4
+            ),
+            'Trong đầm gì đẹp' + ' …')
 
         # Words enclosed or intervaled by HTML tags.
         self.assertEqual(
diff --git a/pelican/utils.py b/pelican/utils.py
index 17667078..f3a01217 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -412,10 +412,7 @@ def posixize_path(rel_path):
 
 class _HTMLWordTruncator(HTMLParser):
 
-    _word_regex = re.compile(r"(({SBC})({SBC}|-|')*)|{DBC}".format(
-        # SBC means Latin-like characters. A word contains a few characters.
-        #         ASCII |Extended Latin | Cyrillic
-        SBC="[0-9a-zA-Z]|[\u00C0-\u024f]|[\u0400-\u04FF]",
+    _word_regex = re.compile(r"{DBC}|(\w[\w'-]*)".format(
         # DBC means CJK-like characters. An character can stand for a word.
         DBC=("([\u4E00-\u9FFF])|"          # CJK Unified Ideographs
              "([\u3400-\u4DBF])|"          # CJK Unified Ideographs Extension A