Rewrite pelican.utils.slugify to use unicode and add tests

Adds a use_unicode kwarg to slugify to keep unicode characters as is (no ASCII-fying) and add tests for it. Also reworks how slugification logic. slugify started with the Django method for slugiying: - Normalize to compatibility decomposed from (NFKD) - Encode and decode with 'ascii' This works fine if the decomposed form contains ASCII characters (i.e. ç can be changed in to c+CEDILLA and ASCII would keep c only), but fails when decomposition doesn't result in ASCII characters (i.e. Chinese). To solve that 'unidecode' was added, which works fine for both cases. However, old method is now redundant but was kept. This commit removes the old method and adjusts logic slightly. Now slugify will normalize all text with composition mode (NFKC) to unify format for regex substitutions. And then if use_unicode is False, uses unidecode to convert it to ASCII.
2020-04-19 17:23:26 +03:00 · 2020-04-19 17:23:26 +03:00 · 03d9c38871
commit 03d9c38871
parent 59462ad415
2 changed files with 63 additions and 16 deletions
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@ -128,6 +128,45 @@ class TestUtils(LoggedTestCase):
        self.assertEqual(
            utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat')

+    def test_slugify_use_unicode(self):
+
+        samples = (
+            ('this is a test', 'this-is-a-test'),
+            ('this        is a test', 'this-is-a-test'),
+            ('this → is ← a ↑ test', 'this-is-a-test'),
+            ('this--is---a test', 'this-is-a-test'),
+            ('unicode測試許功蓋，你看到了嗎？', 'unicode測試許功蓋你看到了嗎'),
+            ('Çığ', 'çığ')
+        )
+
+        settings = read_settings()
+        subs = settings['SLUG_REGEX_SUBSTITUTIONS']
+
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify(value, regex_subs=subs, use_unicode=True),
+                expected)
+
+        # check with preserve case
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify('Çığ', regex_subs=subs,
+                              preserve_case=True, use_unicode=True),
+                'Çığ')
+
+        # check normalization
+        samples = (
+            ('大飯原発４号機、１８日夜起動へ', '大飯原発4号機18日夜起動へ'),
+            (
+                '\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}',
+                '\N{LATIN SMALL LETTER C WITH CEDILLA}'
+            )
+        )
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify(value, regex_subs=subs, use_unicode=True),
+                expected)
+
    def test_slugify_substitute(self):

        samples = (('C++ is based on C', 'cpp-is-based-on-c'),
--- a/pelican/utils.py
+++ b/pelican/utils.py
@ -222,7 +222,7 @@ def pelican_open(filename, mode='r', strip_crs=(sys.platform == 'win32')):
    yield content


-def slugify(value, regex_subs=(), preserve_case=False):
+def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
@ -230,28 +230,36 @@ def slugify(value, regex_subs=(), preserve_case=False):
    Took from Django sources.
    """

-    # TODO Maybe steal again from current Django 1.5dev
-    value = Markup(value).striptags()
-    # value must be unicode per se
    import unicodedata
-    from unidecode import unidecode
-    value = unidecode(value)
-    if isinstance(value, bytes):
-        value = value.decode('ascii')
-    # still unicode
-    value = unicodedata.normalize('NFKD', value)
+    import unidecode

+    def normalize_unicode(text):
+        # normalize text by compatibility composition
+        # see: https://en.wikipedia.org/wiki/Unicode_equivalence
+        return unicodedata.normalize('NFKC', text)
+
+    # strip tags from value
+    value = Markup(value).striptags()
+
+    # normalization
+    value = normalize_unicode(value)
+
+    if not use_unicode:
+        # ASCII-fy
+        value = unidecode.unidecode(value)
+
+    # perform regex substitutions
    for src, dst in regex_subs:
-        value = re.sub(src, dst, value, flags=re.IGNORECASE)
+        value = re.sub(
+            normalize_unicode(src),
+            normalize_unicode(dst),
+            value,
+            flags=re.IGNORECASE)

-    # convert to lowercase
    if not preserve_case:
        value = value.lower()

-    # we want only ASCII chars
-    value = value.encode('ascii', 'ignore').strip()
-    # but Pelican should generally use only unicode
-    return value.decode('ascii')
+    return value.strip()


 def copy(source, destination, ignores=None):