forked from github/pelican
Rewrite pelican.utils.slugify to use unicode and add tests
Adds a use_unicode kwarg to slugify to keep unicode characters as is (no ASCII-fying) and adds tests for it. Also reworks the slugification logic. slugify started with the Django method for slugifying: - Normalize to compatibility decomposed form (NFKD) - Encode and decode with 'ascii' This works fine if the decomposed form contains ASCII characters (e.g. ç can be changed into c+CEDILLA and ASCII would keep c only), but fails when decomposition doesn't result in ASCII characters (e.g. Chinese). To solve that, 'unidecode' was added, which works fine for both cases. However, the old method then became redundant but was kept. This commit removes the old method and adjusts the logic slightly. Now slugify normalizes all text with composition mode (NFKC) to unify the format for regex substitutions, and then, if use_unicode is False, uses unidecode to convert it to ASCII.
This commit is contained in:
parent
59462ad415
commit
03d9c38871
2 changed files with 63 additions and 16 deletions
|
|
@ -128,6 +128,45 @@ class TestUtils(LoggedTestCase):
|
|||
self.assertEqual(
|
||||
utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat')
|
||||
|
||||
def test_slugify_use_unicode(self):
    """Check ``utils.slugify`` when called with ``use_unicode=True``.

    Three things are exercised, all with the substitutions taken from
    ``settings['SLUG_REGEX_SUBSTITUTIONS']``:

    * a table of inputs and the slugs expected for them,
    * a single ``preserve_case=True`` call, whose result is expected
      to keep the upper-case letter of the input,
    * two normalization samples, one of which feeds ``c`` followed by
      a combining cedilla and expects the single precomposed character.
    """
    samples = (
        ('this is a test', 'this-is-a-test'),
        # NOTE(review): identical to the previous sample as shown here;
        # presumably it was meant to contain a run of several spaces --
        # confirm against the upstream test file.
        ('this is a test', 'this-is-a-test'),
        ('this → is ← a ↑ test', 'this-is-a-test'),
        ('this--is---a test', 'this-is-a-test'),
        ('unicode測試許功蓋,你看到了嗎?', 'unicode測試許功蓋你看到了嗎'),
        ('Çığ', 'çığ')
    )

    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']

    for value, expected in samples:
        self.assertEqual(
            utils.slugify(value, regex_subs=subs, use_unicode=True),
            expected)

    # check with preserve case
    # The original wrapped this assertion in a loop over ``samples`` but
    # never used the loop variables, so the same constant check ran once
    # per sample. One call covers exactly the same ground.
    self.assertEqual(
        utils.slugify('Çığ', regex_subs=subs,
                      preserve_case=True, use_unicode=True),
        'Çığ')

    # check normalization
    samples = (
        ('大飯原発4号機、18日夜起動へ', '大飯原発4号機18日夜起動へ'),
        (
            '\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}',
            '\N{LATIN SMALL LETTER C WITH CEDILLA}'
        )
    )
    for value, expected in samples:
        self.assertEqual(
            utils.slugify(value, regex_subs=subs, use_unicode=True),
            expected)
|
||||
|
||||
def test_slugify_substitute(self):
|
||||
|
||||
samples = (('C++ is based on C', 'cpp-is-based-on-c'),
|
||||
|
|
|
|||
|
|
@ -222,7 +222,7 @@ def pelican_open(filename, mode='r', strip_crs=(sys.platform == 'win32')):
|
|||
yield content
|
||||
|
||||
|
||||
def slugify(value, regex_subs=(), preserve_case=False):
|
||||
def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False):
|
||||
"""
|
||||
Normalizes string, converts to lowercase, removes non-alpha characters,
|
||||
and converts spaces to hyphens.
|
||||
|
|
@ -230,28 +230,36 @@ def slugify(value, regex_subs=(), preserve_case=False):
|
|||
Took from Django sources.
|
||||
"""
|
||||
|
||||
# TODO Maybe steal again from current Django 1.5dev
|
||||
value = Markup(value).striptags()
|
||||
# value must be unicode per se
|
||||
import unicodedata
|
||||
from unidecode import unidecode
|
||||
value = unidecode(value)
|
||||
if isinstance(value, bytes):
|
||||
value = value.decode('ascii')
|
||||
# still unicode
|
||||
value = unicodedata.normalize('NFKD', value)
|
||||
import unidecode
|
||||
|
||||
def normalize_unicode(text):
|
||||
# normalize text by compatibility composition
|
||||
# see: https://en.wikipedia.org/wiki/Unicode_equivalence
|
||||
return unicodedata.normalize('NFKC', text)
|
||||
|
||||
# strip tags from value
|
||||
value = Markup(value).striptags()
|
||||
|
||||
# normalization
|
||||
value = normalize_unicode(value)
|
||||
|
||||
if not use_unicode:
|
||||
# ASCII-fy
|
||||
value = unidecode.unidecode(value)
|
||||
|
||||
# perform regex substitutions
|
||||
for src, dst in regex_subs:
|
||||
value = re.sub(src, dst, value, flags=re.IGNORECASE)
|
||||
value = re.sub(
|
||||
normalize_unicode(src),
|
||||
normalize_unicode(dst),
|
||||
value,
|
||||
flags=re.IGNORECASE)
|
||||
|
||||
# convert to lowercase
|
||||
if not preserve_case:
|
||||
value = value.lower()
|
||||
|
||||
# we want only ASCII chars
|
||||
value = value.encode('ascii', 'ignore').strip()
|
||||
# but Pelican should generally use only unicode
|
||||
return value.decode('ascii')
|
||||
return value.strip()
|
||||
|
||||
|
||||
def copy(source, destination, ignores=None):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue