Merge pull request #2731 from avaris/unicode-slugify

Add support for Unicode slugs
Authored by Justin Mayer, committed via GitHub on 2020-04-21 05:31:51 +02:00
commit 7e24886190
7 changed files with 141 additions and 40 deletions

docs/settings.rst

@@ -320,12 +320,6 @@ Basic settings
    A list of default Pygments settings for your reStructuredText code blocks.
    See :ref:`internal_pygments_options` for a list of supported options.
 
-.. data:: SLUGIFY_SOURCE = 'title'
-
-   Specifies where you want the slug to be automatically generated from. Can be
-   set to ``title`` to use the 'Title:' metadata tag or ``basename`` to use the
-   article's file name when creating the slug.
-
 .. data:: CACHE_CONTENT = False
 
    If ``True``, saves content in caches. See
@@ -621,6 +615,25 @@ corresponding ``*_URL`` setting as string, while others hard-code them:
    ``'archives.html'``, ``'authors.html'``, ``'categories.html'``,
    ``'tags.html'``.
 
+.. data:: SLUGIFY_SOURCE = 'title'
+
+   Specifies where you want the slug to be automatically generated from. Can be
+   set to ``title`` to use the 'Title:' metadata tag or ``basename`` to use the
+   article's file name when creating the slug.
+
+.. data:: SLUGIFY_USE_UNICODE = False
+
+   Allow Unicode characters in slugs. Set to ``True`` to keep Unicode
+   characters in auto-generated slugs. Otherwise, Unicode characters will be
+   replaced with ASCII equivalents.
+
+.. data:: SLUGIFY_PRESERVE_CASE = False
+
+   Preserve uppercase characters in slugs. Set to ``True`` to keep the
+   uppercase characters from the ``SLUGIFY_SOURCE`` as-is.
+
 .. data:: SLUG_REGEX_SUBSTITUTIONS = [
        (r'[^\\w\\s-]', ''), # remove non-alphabetical/whitespace/'-' chars
        (r'(?u)\\A\\s*', ''), # strip leading whitespace
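Taken together, these settings control both where the slug text comes from and how it is transformed. A minimal pelicanconf.py sketch (illustrative values, not the new defaults):

    # pelicanconf.py (illustrative): build slugs from the article title,
    # keep Unicode characters, and preserve the original casing.
    SLUGIFY_SOURCE = 'title'
    SLUGIFY_USE_UNICODE = True
    SLUGIFY_PRESERVE_CASE = True

With these values, a post titled 'Çığ' keeps the slug 'Çığ' instead of being ASCII-folded to 'cig' (see the tests added below).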

pelican/contents.py

@@ -92,16 +92,18 @@ class Content(object):
         if not hasattr(self, 'slug'):
             if (settings['SLUGIFY_SOURCE'] == 'title' and
                     hasattr(self, 'title')):
-                self.slug = slugify(
-                    self.title,
-                    regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
+                value = self.title
             elif (settings['SLUGIFY_SOURCE'] == 'basename' and
                     source_path is not None):
-                basename = os.path.basename(
-                    os.path.splitext(source_path)[0])
+                value = os.path.basename(os.path.splitext(source_path)[0])
+            else:
+                value = None
+            if value is not None:
                 self.slug = slugify(
-                    basename,
-                    regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
+                    value,
+                    regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []),
+                    preserve_case=settings.get('SLUGIFY_PRESERVE_CASE', False),
+                    use_unicode=settings.get('SLUGIFY_USE_UNICODE', False))
 
         self.source_path = source_path
         self.relative_source_path = self.get_relative_source_path()
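The refactor separates choosing the slug source from slugifying it, so the new keyword arguments are applied in exactly one place. A standalone sketch of the same flow (the helper name and the bare settings dict are illustrative, not part of the patch):

    import os
    from pelican.utils import slugify

    def compute_slug(settings, title=None, source_path=None):
        # Mirror Content: prefer the title, fall back to the file's basename,
        # otherwise leave the slug unset.
        if settings['SLUGIFY_SOURCE'] == 'title' and title is not None:
            value = title
        elif (settings['SLUGIFY_SOURCE'] == 'basename'
              and source_path is not None):
            value = os.path.basename(os.path.splitext(source_path)[0])
        else:
            value = None
        if value is None:
            return None
        return slugify(
            value,
            regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []),
            preserve_case=settings.get('SLUGIFY_PRESERVE_CASE', False),
            use_unicode=settings.get('SLUGIFY_USE_UNICODE', False))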

pelican/settings.py

@@ -155,6 +155,8 @@ DEFAULT_CONFIG = {
     ],
     'INTRASITE_LINK_REGEX': '[{|](?P<what>.*?)[|}]',
     'SLUGIFY_SOURCE': 'title',
+    'SLUGIFY_USE_UNICODE': False,
+    'SLUGIFY_PRESERVE_CASE': False,
     'CACHE_CONTENT': False,
     'CONTENT_CACHING_LAYER': 'reader',
     'CACHE_PATH': 'cache',
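Both new keys default to ``False``, so existing sites keep their current slugs unless they opt in. A quick sanity check (assuming the pelican package from this branch is importable):

    from pelican.settings import DEFAULT_CONFIG

    # Unicode slugs and case preservation are opt-in.
    assert DEFAULT_CONFIG['SLUGIFY_USE_UNICODE'] is False
    assert DEFAULT_CONFIG['SLUGIFY_PRESERVE_CASE'] is False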

pelican/tests/test_contents.py

@@ -135,6 +135,32 @@ class TestPage(LoggedTestCase):
         page = Page(**page_kwargs)
         self.assertEqual(page.slug, 'foo')
 
+        # test slug from title with unicode and case
+        inputs = (
+            # (title, expected, preserve_case, use_unicode)
+            ('指導書', 'zhi-dao-shu', False, False),
+            ('指導書', 'Zhi-Dao-Shu', True, False),
+            ('指導書', '指導書', False, True),
+            ('指導書', '指導書', True, True),
+            ('Çığ', 'cig', False, False),
+            ('Çığ', 'Cig', True, False),
+            ('Çığ', 'çığ', False, True),
+            ('Çığ', 'Çığ', True, True),
+        )
+
+        settings = get_settings()
+        page_kwargs = self._copy_page_kwargs()
+        page_kwargs['settings'] = settings
+        for title, expected, preserve_case, use_unicode in inputs:
+            settings['SLUGIFY_PRESERVE_CASE'] = preserve_case
+            settings['SLUGIFY_USE_UNICODE'] = use_unicode
+            page_kwargs['metadata']['title'] = title
+            page = Page(**page_kwargs)
+            self.assertEqual(page.slug, expected,
+                             (title, preserve_case, use_unicode))
+
     def test_defaultlang(self):
         # If no lang is given, default to the default one.
         page = Page(**self.page_kwargs)

pelican/tests/test_utils.py

@@ -128,6 +128,45 @@ class TestUtils(LoggedTestCase):
         self.assertEqual(
             utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat')
 
+    def test_slugify_use_unicode(self):
+
+        samples = (
+            ('this is a test', 'this-is-a-test'),
+            ('this        is a test', 'this-is-a-test'),
+            ('this → is ← a ↑ test', 'this-is-a-test'),
+            ('this--is---a test', 'this-is-a-test'),
+            ('unicode測試許功蓋你看到了嗎', 'unicode測試許功蓋你看到了嗎'),
+            ('Çığ', 'çığ')
+        )
+
+        settings = read_settings()
+        subs = settings['SLUG_REGEX_SUBSTITUTIONS']
+
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify(value, regex_subs=subs, use_unicode=True),
+                expected)
+
+        # check with preserve case
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify('Çığ', regex_subs=subs,
+                              preserve_case=True, use_unicode=True),
+                'Çığ')
+
+        # check normalization
+        samples = (
+            ('大飯原発4号機、18日夜起動へ', '大飯原発4号機18日夜起動へ'),
+            (
+                '\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}',
+                '\N{LATIN SMALL LETTER C WITH CEDILLA}'
+            )
+        )
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify(value, regex_subs=subs, use_unicode=True),
+                expected)
+
     def test_slugify_substitute(self):
 
         samples = (('C++ is based on C', 'cpp-is-based-on-c'),
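The normalization block above leans on NFKC (compatibility composition): visually equivalent sequences collapse to a single canonical form before the regex substitutions and the comparison run. A short standard-library illustration:

    import unicodedata

    decomposed = '\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}'  # 'c' + combining mark
    composed = unicodedata.normalize('NFKC', decomposed)          # single 'ç' code point

    assert composed == '\N{LATIN SMALL LETTER C WITH CEDILLA}'
    assert len(decomposed) == 2 and len(composed) == 1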

pelican/urlwrappers.py

@@ -34,15 +34,16 @@ class URLWrapper(object):
         if self._slug is None:
             class_key = '{}_REGEX_SUBSTITUTIONS'.format(
                 self.__class__.__name__.upper())
-            if class_key in self.settings:
-                self._slug = slugify(
-                    self.name,
-                    regex_subs=self.settings[class_key])
-            else:
-                self._slug = slugify(
-                    self.name,
-                    regex_subs=self.settings.get(
-                        'SLUG_REGEX_SUBSTITUTIONS', []))
+            regex_subs = self.settings.get(
+                class_key,
+                self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
+            preserve_case = self.settings.get('SLUGIFY_PRESERVE_CASE', False)
+            self._slug = slugify(
+                self.name,
+                regex_subs=regex_subs,
+                preserve_case=preserve_case,
+                use_unicode=self.settings.get('SLUGIFY_USE_UNICODE', False)
+            )
         return self._slug
 
     @slug.setter
@@ -61,8 +62,18 @@
         return hash(self.slug)
 
     def _normalize_key(self, key):
-        subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])
-        return slugify(key, regex_subs=subs)
+        class_key = '{}_REGEX_SUBSTITUTIONS'.format(
+            self.__class__.__name__.upper())
+        regex_subs = self.settings.get(
+            class_key,
+            self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
+        use_unicode = self.settings.get('SLUGIFY_USE_UNICODE', False)
+        preserve_case = self.settings.get('SLUGIFY_PRESERVE_CASE', False)
+        return slugify(
+            key,
+            regex_subs=regex_subs,
+            preserve_case=preserve_case,
+            use_unicode=use_unicode)
 
     def __eq__(self, other):
         if isinstance(other, self.__class__):
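Because ``class_key`` is derived from the wrapper's class name, Category, Tag, and Author objects first look for CATEGORY_REGEX_SUBSTITUTIONS, TAG_REGEX_SUBSTITUTIONS, or AUTHOR_REGEX_SUBSTITUTIONS and only then fall back to SLUG_REGEX_SUBSTITUTIONS. A small sketch of that lookup in isolation (the helper name and bare settings dict are illustrative):

    def lookup_regex_subs(settings, wrapper_cls):
        # e.g. Tag -> 'TAG_REGEX_SUBSTITUTIONS', Category -> 'CATEGORY_REGEX_SUBSTITUTIONS'
        class_key = '{}_REGEX_SUBSTITUTIONS'.format(wrapper_cls.__name__.upper())
        return settings.get(class_key,
                            settings.get('SLUG_REGEX_SUBSTITUTIONS', []))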

pelican/utils.py

@@ -222,7 +222,7 @@ def pelican_open(filename, mode='r', strip_crs=(sys.platform == 'win32')):
         yield content
 
 
-def slugify(value, regex_subs=(), preserve_case=False):
+def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False):
     """
     Normalizes string, converts to lowercase, removes non-alpha characters,
     and converts spaces to hyphens.
@@ -230,28 +230,36 @@ def slugify(value, regex_subs=(), preserve_case=False):
     Took from Django sources.
     """
-    # TODO Maybe steal again from current Django 1.5dev
-    value = Markup(value).striptags()
-    # value must be unicode per se
     import unicodedata
-    from unidecode import unidecode
-    value = unidecode(value)
-    if isinstance(value, bytes):
-        value = value.decode('ascii')
-    # still unicode
-    value = unicodedata.normalize('NFKD', value)
+    import unidecode
+
+    def normalize_unicode(text):
+        # normalize text by compatibility composition
+        # see: https://en.wikipedia.org/wiki/Unicode_equivalence
+        return unicodedata.normalize('NFKC', text)
+
+    # strip tags from value
+    value = Markup(value).striptags()
+
+    # normalization
+    value = normalize_unicode(value)
+
+    if not use_unicode:
+        # ASCII-fy
+        value = unidecode.unidecode(value)
 
+    # perform regex substitutions
     for src, dst in regex_subs:
-        value = re.sub(src, dst, value, flags=re.IGNORECASE)
+        value = re.sub(
+            normalize_unicode(src),
+            normalize_unicode(dst),
+            value,
+            flags=re.IGNORECASE)
 
+    # convert to lowercase
     if not preserve_case:
         value = value.lower()
 
-    # we want only ASCII chars
-    value = value.encode('ascii', 'ignore').strip()
-    # but Pelican should generally use only unicode
-    return value.decode('ascii')
+    return value.strip()
 
 
 def copy(source, destination, ignores=None):
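With the new signature, the two flags compose exactly as the tests above expect. A usage sketch (the substitution list is a simplified stand-in for Pelican's default SLUG_REGEX_SUBSTITUTIONS):

    from pelican.utils import slugify

    # Simplified stand-in for the default SLUG_REGEX_SUBSTITUTIONS.
    subs = [
        (r'[^\w\s-]', ''),   # drop punctuation
        (r'(?u)\A\s*', ''),  # strip leading whitespace
        (r'(?u)\s*\Z', ''),  # strip trailing whitespace
        (r'[-\s]+', '-'),    # collapse whitespace and repeated hyphens
    ]

    slugify('Çığ', regex_subs=subs)                      # 'cig'
    slugify('Çığ', regex_subs=subs, preserve_case=True)  # 'Cig'
    slugify('Çığ', regex_subs=subs, use_unicode=True)    # 'çığ'
    slugify('Çığ', regex_subs=subs, preserve_case=True,
            use_unicode=True)                            # 'Çığ'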