Rewrite pelican.utils.slugify to use unicode and add tests

Adds a use_unicode kwarg to slugify to keep unicode characters as is (no ASCII-fying) and adds tests for it. Also reworks the slugification logic. slugify started with the Django method for slugifying: - Normalize to compatibility decomposed form (NFKD) - Encode and decode with 'ascii' This works fine if the decomposed form contains ASCII characters (e.g. ç can be changed into c+CEDILLA and ASCII would keep c only), but fails when decomposition doesn't result in ASCII characters (e.g. Chinese). To solve that, 'unidecode' was added, which works fine for both cases. However, the old method is now redundant but was kept. This commit removes the old method and adjusts the logic slightly. Now slugify will normalize all text with composition mode (NFKC) to unify the format for regex substitutions. Then, if use_unicode is False, it uses unidecode to convert the text to ASCII.
2025-10-15 20:28:56 +02:00 · 2020-04-19 17:23:26 +03:00 · 2020-04-19 17:23:26 +03:00 · 03d9c38871
commit 03d9c38871
parent 59462ad415
2 changed files with 63 additions and 16 deletions
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -128,6 +128,45 @@ class TestUtils(LoggedTestCase):
        self.assertEqual(
            utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat')

+    def test_slugify_use_unicode(self):
+        """Check that use_unicode=True keeps non-ASCII characters in slugs."""
+        samples = (
+            ('this is a test', 'this-is-a-test'),
+            ('this        is a test', 'this-is-a-test'),
+            ('this → is ← a ↑ test', 'this-is-a-test'),
+            ('this--is---a test', 'this-is-a-test'),
+            ('unicode測試許功蓋，你看到了嗎？', 'unicode測試許功蓋你看到了嗎'),
+            ('Çığ', 'çığ')
+        )
+
+        settings = read_settings()
+        subs = settings['SLUG_REGEX_SUBSTITUTIONS']
+
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify(value, regex_subs=subs, use_unicode=True),
+                expected)
+
+        # check with preserve case (fixed input, so a single assertion suffices)
+        self.assertEqual(
+            utils.slugify('Çığ', regex_subs=subs,
+                          preserve_case=True, use_unicode=True),
+            'Çığ')
+
+        # check normalization: full-width digits become ASCII digits and
+        # a base letter plus combining mark composes into a single character
+        samples = (
+            ('大飯原発４号機、１８日夜起動へ', '大飯原発4号機18日夜起動へ'),
+            (
+                '\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}',
+                '\N{LATIN SMALL LETTER C WITH CEDILLA}'
+            )
+        )
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify(value, regex_subs=subs, use_unicode=True),
+                expected)
+
    def test_slugify_substitute(self):

        samples = (('C++ is based on C', 'cpp-is-based-on-c'),