diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py
index 3a8c4bd8..4a03cdc4 100644
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -128,6 +128,45 @@ class TestUtils(LoggedTestCase):
         self.assertEqual(
             utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat')
 
+    def test_slugify_use_unicode(self):
+        """Check slugify when use_unicode=True keeps non-ASCII characters."""
+
+        samples = (
+            ('this is a test', 'this-is-a-test'),
+            ('this        is a test', 'this-is-a-test'),
+            ('this → is ← a ↑ test', 'this-is-a-test'),
+            ('this--is---a test', 'this-is-a-test'),
+            ('unicode測試許功蓋,你看到了嗎?', 'unicode測試許功蓋你看到了嗎'),
+            ('Çığ', 'çığ')
+        )
+
+        settings = read_settings()
+        subs = settings['SLUG_REGEX_SUBSTITUTIONS']
+
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify(value, regex_subs=subs, use_unicode=True),
+                expected)
+
+        # with preserve_case the non-ASCII capital must not be lowercased
+        self.assertEqual(
+            utils.slugify('Çığ', regex_subs=subs,
+                          preserve_case=True, use_unicode=True),
+            'Çığ')
+
+        # check normalization
+        samples = (
+            ('大飯原発4号機、18日夜起動へ', '大飯原発4号機18日夜起動へ'),
+            (
+                '\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}',
+                '\N{LATIN SMALL LETTER C WITH CEDILLA}'
+            )
+        )
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify(value, regex_subs=subs, use_unicode=True),
+                expected)
+
     def test_slugify_substitute(self):
 
         samples = (('C++ is based on C', 'cpp-is-based-on-c'),
diff --git a/pelican/utils.py b/pelican/utils.py
index b1536de8..c1b79ed9 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -222,7 +222,7 @@ def pelican_open(filename, mode='r', strip_crs=(sys.platform == 'win32')):
     yield content
 
 
-def slugify(value, regex_subs=(), preserve_case=False):
+def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False):
     """
     Normalizes string, converts to lowercase, removes non-alpha characters,
     and converts spaces to hyphens.
@@ -230,28 +230,36 @@ def slugify(value, regex_subs=(), preserve_case=False):
     Took from Django sources.
     """
 
-    # TODO Maybe steal again from current Django 1.5dev
-    value = Markup(value).striptags()
-    # value must be unicode per se
     import unicodedata
-    from unidecode import unidecode
-    value = unidecode(value)
-    if isinstance(value, bytes):
-        value = value.decode('ascii')
-    # still unicode
-    value = unicodedata.normalize('NFKD', value)
+    import unidecode
 
+    def normalize_unicode(text):
+        # normalize text by compatibility composition
+        # see: https://en.wikipedia.org/wiki/Unicode_equivalence
+        return unicodedata.normalize('NFKC', text)
+
+    # strip tags from value
+    value = Markup(value).striptags()
+
+    # normalization
+    value = normalize_unicode(value)
+
+    if not use_unicode:
+        # ASCII-fy: transliterate anything outside ASCII
+        value = unidecode.unidecode(value)
+
+    # perform regex substitutions (patterns are normalized like value)
     for src, dst in regex_subs:
-        value = re.sub(src, dst, value, flags=re.IGNORECASE)
+        value = re.sub(
+            normalize_unicode(src),
+            normalize_unicode(dst),
+            value,
+            flags=re.IGNORECASE)
 
-    # convert to lowercase
     if not preserve_case:
         value = value.lower()
 
-    # we want only ASCII chars
-    value = value.encode('ascii', 'ignore').strip()
-    # but Pelican should generally use only unicode
-    return value.decode('ascii')
+    return value.strip()
 
 
 def copy(source, destination, ignores=None):