From d5d792060cd5e4c3ff6da82359589b990fa83f27 Mon Sep 17 00:00:00 2001 From: manhhomienbienthuy <1755796+manhhomienbienthuy@users.noreply.github.com> Date: Tue, 12 Jul 2022 02:47:37 +0900 Subject: [PATCH] Fix #2982: Improve _HTMLWordTruncator (#3002) --- RELEASE.md | 5 +++++ pelican/tests/test_utils.py | 5 +++++ pelican/utils.py | 5 +---- 3 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 RELEASE.md diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 00000000..36f865f8 --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,5 @@ +Release type: minor + +* Use JSON values for extra settings in Invoke tasks template (#2994) +* Add content tag for links, which can help with things like Twitter social cards (#3001) +* Improve word count behavior when generating summary (#3002) diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 710e14ed..ee5146df 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -230,6 +230,11 @@ class TestUtils(LoggedTestCase): 'Ты мелькнула, ты предстала, Снова сердце задрожало,', 3 ), 'Ты мелькнула, ты' + ' …') + self.assertEqual( + utils.truncate_html_words( + 'Trong đầm gì đẹp bằng sen', 4 + ), + 'Trong đầm gì đẹp' + ' …') # Words enclosed or intervaled by HTML tags. self.assertEqual( diff --git a/pelican/utils.py b/pelican/utils.py index 17667078..f3a01217 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -412,10 +412,7 @@ def posixize_path(rel_path): class _HTMLWordTruncator(HTMLParser): - _word_regex = re.compile(r"(({SBC})({SBC}|-|')*)|{DBC}".format( - # SBC means Latin-like characters. A word contains a few characters. - # ASCII |Extended Latin | Cyrillic - SBC="[0-9a-zA-Z]|[\u00C0-\u024f]|[\u0400-\u04FF]", + _word_regex = re.compile(r"{DBC}|(\w[\w'-]*)".format( # DBC means CJK-like characters. An character can stand for a word. DBC=("([\u4E00-\u9FFF])|" # CJK Unified Ideographs "([\u3400-\u4DBF])|" # CJK Unified Ideographs Extension A