1
0
Fork 0
forked from github/pelican

Fix #2982: Improve _HTMLWordTruncator (#3002)

This commit is contained in:
manhhomienbienthuy 2022-07-12 02:47:37 +09:00 committed by GitHub
commit d5d792060c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 11 additions and 4 deletions

5
RELEASE.md Normal file
View file

@@ -0,0 +1,5 @@
Release type: minor
* Use JSON values for extra settings in Invoke tasks template (#2994)
* Add content tag for links, which can help with things like Twitter social cards (#3001)
* Improve word count behavior when generating summary (#3002)

View file

@@ -230,6 +230,11 @@ class TestUtils(LoggedTestCase):
'Ты мелькнула, ты предстала, Снова сердце задрожало,', 3
),
'Ты мелькнула, ты' + '')
self.assertEqual(
utils.truncate_html_words(
'Trong đầm gì đẹp bằng sen', 4
),
'Trong đầm gì đẹp' + '')
# Words enclosed or intervaled by HTML tags.
self.assertEqual(

View file

@@ -412,10 +412,7 @@ def posixize_path(rel_path):
class _HTMLWordTruncator(HTMLParser):
_word_regex = re.compile(r"(({SBC})({SBC}|-|')*)|{DBC}".format(
# SBC means Latin-like characters. A word contains a few characters.
# ASCII |Extended Latin | Cyrillic
SBC="[0-9a-zA-Z]|[\u00C0-\u024f]|[\u0400-\u04FF]",
_word_regex = re.compile(r"{DBC}|(\w[\w'-]*)".format(
# DBC means CJK-like characters. A character can stand for a word.
DBC=("([\u4E00-\u9FFF])|" # CJK Unified Ideographs
"([\u3400-\u4DBF])|" # CJK Unified Ideographs Extension A