From 22192c148a88b30dd942d7bad2a1ab243de59bc3 Mon Sep 17 00:00:00 2001 From: ImBearChild Date: Wed, 29 Sep 2021 14:44:47 +0800 Subject: [PATCH] Improve word count behavior when generating summary Improve _HTMLWordTruncator by using more than one unicode block in _word_regex, making word count function behave properly with CJK, Cyrillic, and more Latin characters when generating summary. --- pelican/tests/test_utils.py | 13 +++++++++++++ pelican/utils.py | 21 ++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 5fbd066d..710e14ed 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -218,6 +218,19 @@ class TestUtils(LoggedTestCase): utils.truncate_html_words('word ' * 100, 20), 'word ' * 20 + '…') + # Plain text with Unicode content. + self.assertEqual( + utils.truncate_html_words( + '我愿意这样,朋友——我独自远行,不但没有你,\ + 并且再没有别的影在黑暗里。', 12 + ), + '我愿意这样,朋友——我独自远行' + ' …') + self.assertEqual( + utils.truncate_html_words( + 'Ты мелькнула, ты предстала, Снова сердце задрожало,', 3 + ), + 'Ты мелькнула, ты' + ' …') + # Words enclosed or intervaled by HTML tags. self.assertEqual( utils.truncate_html_words('

' + 'word ' * 100 + '

', 20), diff --git a/pelican/utils.py b/pelican/utils.py index b69cbf95..5065d108 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -133,6 +133,7 @@ class memoized: (not reevaluated). """ + def __init__(self, func): self.func = func self.cache = {} @@ -408,7 +409,21 @@ def posixize_path(rel_path): class _HTMLWordTruncator(HTMLParser): - _word_regex = re.compile(r"\w[\w'-]*", re.U) + _word_regex = re.compile(r"(({SBC})({SBC}|-|')*)|{DBC}".format( + # SBC means Latin-like characters. A word contains a few characters. + # ASCII |Extended Latin | Cyrillic + SBC="[0-9a-zA-Z]|[\u00C0-\u024f]|[\u0400-\u04FF]", + # DBC means CJK-like characters. A character can stand for a word. + DBC=("([\u4E00-\u9FFF])|" # CJK Unified Ideographs + "([\u3400-\u4DBF])|" # CJK Unified Ideographs Extension A + "([\uF900-\uFAFF])|" # CJK Compatibility Ideographs + "([\U00020000-\U0002A6DF])|" # CJK Unified Ideographs Extension B + "([\U0002F800-\U0002FA1F])|" # CJK Compatibility Ideographs Supplement + "([\u3040-\u30FF])|" # Hiragana and Katakana + "([\u1100-\u11FF])|" # Hangul Jamo + "([\uAC00-\uD7FF])|" # Hangul Syllables + "([\u3130-\u318F])" # Hangul Compatibility Jamo + )), re.UNICODE) _word_prefix_regex = re.compile(r'\w', re.U) _singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input') @@ -818,8 +833,8 @@ class FileSystemWatcher: } ) logger.warning( - 'No valid files found in content for the active readers:\n' - + '\n'.join(reader_descs)) + 'No valid files found in content for the active readers:\n' + + '\n'.join(reader_descs)) if result.get('theme') is None: logger.warning('Empty theme folder. Using `basic` theme.')