Fix #2982: Improve _HTMLWordTruncator

This commit is contained in:
manhhomienbienthuy 2022-05-02 22:12:23 +09:00
commit 747fec5b22
No known key found for this signature in database
GPG key ID: 290D03B85EE81F0F
3 changed files with 9 additions and 4 deletions

3
RELEASE.md Normal file
View file

@ -0,0 +1,3 @@
Release type: minor
Improve word count behavior when generating summary.

View file

@ -230,6 +230,11 @@ class TestUtils(LoggedTestCase):
'Ты мелькнула, ты предстала, Снова сердце задрожало,', 3
),
'Ты мелькнула, ты' + '')
self.assertEqual(
utils.truncate_html_words(
'Trong đầm gì đẹp bằng sen', 4
),
'Trong đầm gì đẹp' + '')
# Words enclosed or separated by HTML tags.
self.assertEqual(

View file

@ -412,10 +412,7 @@ def posixize_path(rel_path):
class _HTMLWordTruncator(HTMLParser):
_word_regex = re.compile(r"(({SBC})({SBC}|-|')*)|{DBC}".format(
# SBC means Latin-like characters. A word contains a few characters.
# ASCII |Extended Latin | Cyrillic
SBC="[0-9a-zA-Z]|[\u00C0-\u024f]|[\u0400-\u04FF]",
_word_regex = re.compile(r"{DBC}|(\w[\w'-]*)".format(
# DBC means CJK-like characters. A character can stand for a word.
DBC=("([\u4E00-\u9FFF])|" # CJK Unified Ideographs
"([\u3400-\u4DBF])|" # CJK Unified Ideographs Extension A