Improve word count behavior when generating summary

Improve _HTMLWordTruncator by using more than one Unicode block in
_word_regex, so that the word count function behaves properly with CJK,
Cyrillic, and additional Latin characters when generating a summary.
This commit is contained in:
ImBearChild 2021-09-29 14:44:47 +08:00 committed by Justin Mayer
commit 22192c148a
2 changed files with 31 additions and 3 deletions

View file

@@ -218,6 +218,19 @@ class TestUtils(LoggedTestCase):
utils.truncate_html_words('word ' * 100, 20),
'word ' * 20 + '')
# Plain text with Unicode content.
self.assertEqual(
utils.truncate_html_words(
'我愿意这样,朋友——我独自远行,不但没有你,\
并且再没有别的影在黑暗里', 12
),
'我愿意这样,朋友——我独自远行' + '')
self.assertEqual(
utils.truncate_html_words(
'Ты мелькнула, ты предстала, Снова сердце задрожало,', 3
),
'Ты мелькнула, ты' + '')
# Words enclosed or intervaled by HTML tags.
self.assertEqual(
utils.truncate_html_words('<p>' + 'word ' * 100 + '</p>', 20),