1
0
Fork 0
forked from github/pelican

Rewrite pelican.utils.slugify to use unicode and add tests

Adds a use_unicode kwarg to slugify to keep unicode
characters as is (no ASCII-fying) and adds tests for
it. Also reworks the slugification logic.

slugify started with the Django method for slugifying:
 - Normalize to compatibility decomposed form (NFKD)
 - Encode and decode with 'ascii'

This works fine if the decomposed form contains ASCII
characters (e.g. ç can be changed into c+CEDILLA and
ASCII would keep c only), but fails when decomposition
doesn't result in ASCII characters (e.g. Chinese). To
solve that, 'unidecode' was added, which works fine for
both cases. However, the old method became redundant but
was kept. This commit removes the old method and
adjusts the logic slightly.

Now slugify will normalize all text with composition
mode (NFKC) to unify format for regex substitutions.
And then if use_unicode is False, uses unidecode to
convert it to ASCII.
This commit is contained in:
Deniz Turgut 2020-04-19 17:23:26 +03:00
commit 03d9c38871
2 changed files with 63 additions and 16 deletions

View file

@ -128,6 +128,45 @@ class TestUtils(LoggedTestCase):
self.assertEqual(
utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat')
def test_slugify_use_unicode(self):
    """Check slugify output when called with use_unicode=True."""
    samples = (
        ('this is a test', 'this-is-a-test'),
        ('this is a test', 'this-is-a-test'),
        ('this → is ← a ↑ test', 'this-is-a-test'),
        ('this--is---a test', 'this-is-a-test'),
        ('unicode測試許功蓋你看到了嗎', 'unicode測試許功蓋你看到了嗎'),
        ('Çığ', 'çığ')
    )

    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']

    for value, expected in samples:
        self.assertEqual(
            utils.slugify(value, regex_subs=subs, use_unicode=True),
            expected)

    # check with preserve case
    # A single assertion is enough here: the input and the expected slug
    # are fixed, so looping over ``samples`` would only repeat the same
    # check without using the loop variables.
    self.assertEqual(
        utils.slugify('Çığ', regex_subs=subs,
                      preserve_case=True, use_unicode=True),
        'Çığ')

    # check normalization
    normalization_samples = (
        # the ideographic comma is expected to be dropped from the slug
        ('大飯原発4号機、18日夜起動へ', '大飯原発4号機18日夜起動へ'),
        # a decomposed "c" + combining cedilla is expected to come back
        # as the single precomposed character
        (
            '\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}',
            '\N{LATIN SMALL LETTER C WITH CEDILLA}'
        )
    )
    for value, expected in normalization_samples:
        self.assertEqual(
            utils.slugify(value, regex_subs=subs, use_unicode=True),
            expected)
def test_slugify_substitute(self):
samples = (('C++ is based on C', 'cpp-is-based-on-c'),

View file

@ -222,7 +222,7 @@ def pelican_open(filename, mode='r', strip_crs=(sys.platform == 'win32')):
yield content
def slugify(value, regex_subs=(), preserve_case=False):
def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False):
"""
Normalizes string, converts to lowercase, removes non-alpha characters,
and converts spaces to hyphens.
@ -230,28 +230,36 @@ def slugify(value, regex_subs=(), preserve_case=False):
Took from Django sources.
"""
# TODO Maybe steal again from current Django 1.5dev
value = Markup(value).striptags()
# value must be unicode per se
import unicodedata
from unidecode import unidecode
value = unidecode(value)
if isinstance(value, bytes):
value = value.decode('ascii')
# still unicode
value = unicodedata.normalize('NFKD', value)
import unidecode
def normalize_unicode(text):
# normalize text by compatibility composition
# see: https://en.wikipedia.org/wiki/Unicode_equivalence
return unicodedata.normalize('NFKC', text)
# strip tags from value
value = Markup(value).striptags()
# normalization
value = normalize_unicode(value)
if not use_unicode:
# ASCII-fy
value = unidecode.unidecode(value)
# perform regex substitutions
for src, dst in regex_subs:
value = re.sub(src, dst, value, flags=re.IGNORECASE)
value = re.sub(
normalize_unicode(src),
normalize_unicode(dst),
value,
flags=re.IGNORECASE)
# convert to lowercase
if not preserve_case:
value = value.lower()
# we want only ASCII chars
value = value.encode('ascii', 'ignore').strip()
# but Pelican should generally use only unicode
return value.decode('ascii')
return value.strip()
def copy(source, destination, ignores=None):