From 03d9c38871e0561c8796350d9f27bfc674027fc2 Mon Sep 17 00:00:00 2001
From: Deniz Turgut <dturgut@gmail.com>
Date: Sun, 19 Apr 2020 17:23:26 +0300
Subject: [PATCH 1/3] Rewrite pelican.utils.slugify to use unicode and add
 tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a use_unicode kwarg to slugify to keep unicode
characters as is (no ASCII-fying) and adds tests for
it. Also reworks the slugification logic.

slugify started with the Django method for slugifying:
 - Normalize to compatibility decomposed form (NFKD)
 - Encode and decode with 'ascii'

This works fine if the decomposed form contains ASCII
characters (e.g. ç can be changed into c+CEDILLA and
ASCII would keep c only), but fails when decomposition
doesn't result in ASCII characters (e.g. Chinese). To
solve that, 'unidecode' was added, which works fine for
both cases. However, the old method is now redundant but
was kept. This commit removes the old method and
adjusts the logic slightly.

Now slugify will normalize all text with composition
mode (NFKC) to unify the format for regex substitutions.
Then, if use_unicode is False, it uses unidecode to
convert the text to ASCII.
---
 pelican/tests/test_utils.py | 39 ++++++++++++++++++++++++++++++++++++
 pelican/utils.py            | 40 ++++++++++++++++++++++---------------
 2 files changed, 63 insertions(+), 16 deletions(-)

diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py
index 3a8c4bd8..4a03cdc4 100644
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -128,6 +128,45 @@ class TestUtils(LoggedTestCase):
         self.assertEqual(
             utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat')
 
+    def test_slugify_use_unicode(self):
+
+        samples = (
+            ('this is a test', 'this-is-a-test'),
+            ('this        is a test', 'this-is-a-test'),
+            ('this → is ← a ↑ test', 'this-is-a-test'),
+            ('this--is---a test', 'this-is-a-test'),
+            ('unicode測試許功蓋，你看到了嗎？', 'unicode測試許功蓋你看到了嗎'),
+            ('Çığ', 'çığ')
+        )
+
+        settings = read_settings()
+        subs = settings['SLUG_REGEX_SUBSTITUTIONS']
+
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify(value, regex_subs=subs, use_unicode=True),
+                expected)
+
+        # check with preserve case
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify('Çığ', regex_subs=subs,
+                              preserve_case=True, use_unicode=True),
+                'Çığ')
+
+        # check normalization
+        samples = (
+            ('大飯原発４号機、１８日夜起動へ', '大飯原発4号機18日夜起動へ'),
+            (
+                '\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}',
+                '\N{LATIN SMALL LETTER C WITH CEDILLA}'
+            )
+        )
+        for value, expected in samples:
+            self.assertEqual(
+                utils.slugify(value, regex_subs=subs, use_unicode=True),
+                expected)
+
     def test_slugify_substitute(self):
 
         samples = (('C++ is based on C', 'cpp-is-based-on-c'),
diff --git a/pelican/utils.py b/pelican/utils.py
index b1536de8..c1b79ed9 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -222,7 +222,7 @@ def pelican_open(filename, mode='r', strip_crs=(sys.platform == 'win32')):
     yield content
 
 
-def slugify(value, regex_subs=(), preserve_case=False):
+def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False):
     """
     Normalizes string, converts to lowercase, removes non-alpha characters,
     and converts spaces to hyphens.
@@ -230,28 +230,36 @@ def slugify(value, regex_subs=(), preserve_case=False):
     Took from Django sources.
     """
 
-    # TODO Maybe steal again from current Django 1.5dev
-    value = Markup(value).striptags()
-    # value must be unicode per se
     import unicodedata
-    from unidecode import unidecode
-    value = unidecode(value)
-    if isinstance(value, bytes):
-        value = value.decode('ascii')
-    # still unicode
-    value = unicodedata.normalize('NFKD', value)
+    import unidecode
 
+    def normalize_unicode(text):
+        # normalize text by compatibility composition
+        # see: https://en.wikipedia.org/wiki/Unicode_equivalence
+        return unicodedata.normalize('NFKC', text)
+
+    # strip tags from value
+    value = Markup(value).striptags()
+
+    # normalization
+    value = normalize_unicode(value)
+
+    if not use_unicode:
+        # ASCII-fy
+        value = unidecode.unidecode(value)
+
+    # perform regex substitutions
     for src, dst in regex_subs:
-        value = re.sub(src, dst, value, flags=re.IGNORECASE)
+        value = re.sub(
+            normalize_unicode(src),
+            normalize_unicode(dst),
+            value,
+            flags=re.IGNORECASE)
 
-    # convert to lowercase
     if not preserve_case:
         value = value.lower()
 
-    # we want only ASCII chars
-    value = value.encode('ascii', 'ignore').strip()
-    # but Pelican should generally use only unicode
-    return value.decode('ascii')
+    return value.strip()
 
 
 def copy(source, destination, ignores=None):

From 97fe235e60a8f46346b35fa32a02ff9e1e5ba395 Mon Sep 17 00:00:00 2001
From: Deniz Turgut <dturgut@gmail.com>
Date: Sun, 19 Apr 2020 18:51:55 +0300
Subject: [PATCH 2/3] Expose use_unicode setting of slugify in settings and use
 it

---
 docs/settings.rst              | 19 +++++++++++++------
 pelican/contents.py            | 15 ++++++++-------
 pelican/settings.py            |  1 +
 pelican/tests/test_contents.py | 11 +++++++++++
 pelican/urlwrappers.py         | 26 +++++++++++++++-----------
 5 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/docs/settings.rst b/docs/settings.rst
index 48344076..60e539d3 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -320,12 +320,6 @@ Basic settings
    A list of default Pygments settings for your reStructuredText code blocks.
    See :ref:`internal_pygments_options` for a list of supported options.
 
-.. data:: SLUGIFY_SOURCE = 'title'
-
-   Specifies where you want the slug to be automatically generated from. Can be
-   set to ``title`` to use the 'Title:' metadata tag or ``basename`` to use the
-   article's file name when creating the slug.
-
 .. data:: CACHE_CONTENT = False
 
    If ``True``, saves content in caches.  See
@@ -621,6 +615,19 @@ corresponding ``*_URL`` setting as string, while others hard-code them:
 ``'archives.html'``, ``'authors.html'``, ``'categories.html'``,
 ``'tags.html'``.
 
+
+.. data:: SLUGIFY_SOURCE = 'title'
+
+   Specifies where you want the slug to be automatically generated from. Can be
+   set to ``title`` to use the 'Title:' metadata tag or ``basename`` to use the
+   article's file name when creating the slug.
+
+.. data:: SLUGIFY_USE_UNICODE = False
+
+   Allow unicode characters in slugs. Set ``True`` to keep unicode characters
+   in auto-generated slugs. Otherwise, unicode characters will be replaced
+   with ASCII equivalents.
+
 .. data:: SLUG_REGEX_SUBSTITUTIONS = [
         (r'[^\\w\\s-]', ''),  # remove non-alphabetical/whitespace/'-' chars
         (r'(?u)\\A\\s*', ''),  # strip leading whitespace
diff --git a/pelican/contents.py b/pelican/contents.py
index 40d9c28e..b49e1f2e 100644
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -92,16 +92,17 @@ class Content(object):
         if not hasattr(self, 'slug'):
             if (settings['SLUGIFY_SOURCE'] == 'title' and
                     hasattr(self, 'title')):
-                self.slug = slugify(
-                    self.title,
-                    regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
+                value = self.title
             elif (settings['SLUGIFY_SOURCE'] == 'basename' and
                     source_path is not None):
-                basename = os.path.basename(
-                    os.path.splitext(source_path)[0])
+                value = os.path.basename(os.path.splitext(source_path)[0])
+            else:
+                value = None
+            if value is not None:
                 self.slug = slugify(
-                    basename,
-                    regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
+                    value,
+                    regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []),
+                    use_unicode=settings['SLUGIFY_USE_UNICODE'])
 
         self.source_path = source_path
         self.relative_source_path = self.get_relative_source_path()
diff --git a/pelican/settings.py b/pelican/settings.py
index 77aea059..0e0397c9 100644
--- a/pelican/settings.py
+++ b/pelican/settings.py
@@ -155,6 +155,7 @@ DEFAULT_CONFIG = {
     ],
     'INTRASITE_LINK_REGEX': '[{|](?P<what>.*?)[|}]',
     'SLUGIFY_SOURCE': 'title',
+    'SLUGIFY_USE_UNICODE': False,
     'CACHE_CONTENT': False,
     'CONTENT_CACHING_LAYER': 'reader',
     'CACHE_PATH': 'cache',
diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py
index 08d4eb73..2a377e75 100644
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@@ -135,6 +135,17 @@ class TestPage(LoggedTestCase):
         page = Page(**page_kwargs)
         self.assertEqual(page.slug, 'foo')
 
+        # test slug from unicode title
+        # slug doesn't use unicode
+        settings['SLUGIFY_SOURCE'] = "title"
+        page_kwargs['metadata']['title'] = '指導書'
+        page = Page(**page_kwargs)
+        self.assertEqual(page.slug, 'zhi-dao-shu')
+        # slug uses unicode
+        settings['SLUGIFY_USE_UNICODE'] = True
+        page = Page(**page_kwargs)
+        self.assertEqual(page.slug, '指導書')
+
     def test_defaultlang(self):
         # If no lang is given, default to the default one.
         page = Page(**self.page_kwargs)
diff --git a/pelican/urlwrappers.py b/pelican/urlwrappers.py
index cc276b3f..86f9ef32 100644
--- a/pelican/urlwrappers.py
+++ b/pelican/urlwrappers.py
@@ -34,15 +34,14 @@ class URLWrapper(object):
         if self._slug is None:
             class_key = '{}_REGEX_SUBSTITUTIONS'.format(
                 self.__class__.__name__.upper())
-            if class_key in self.settings:
-                self._slug = slugify(
-                    self.name,
-                    regex_subs=self.settings[class_key])
-            else:
-                self._slug = slugify(
-                    self.name,
-                    regex_subs=self.settings.get(
-                        'SLUG_REGEX_SUBSTITUTIONS', []))
+            regex_subs = self.settings.get(
+                class_key,
+                self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
+            self._slug = slugify(
+                self.name,
+                regex_subs=regex_subs,
+                use_unicode=self.settings.get('SLUGIFY_USE_UNICODE', False)
+            )
         return self._slug
 
     @slug.setter
@@ -61,8 +60,13 @@ class URLWrapper(object):
         return hash(self.slug)
 
     def _normalize_key(self, key):
-        subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])
-        return slugify(key, regex_subs=subs)
+        class_key = '{}_REGEX_SUBSTITUTIONS'.format(
+            self.__class__.__name__.upper())
+        regex_subs = self.settings.get(
+            class_key,
+            self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
+        use_unicode = self.settings.get('SLUGIFY_USE_UNICODE', False)
+        return slugify(key, regex_subs=regex_subs, use_unicode=use_unicode)
 
     def __eq__(self, other):
         if isinstance(other, self.__class__):

From bd699d34e88da469fa71e904e9ce3f7df6276c84 Mon Sep 17 00:00:00 2001
From: Deniz Turgut <dturgut@gmail.com>
Date: Tue, 21 Apr 2020 00:26:00 +0300
Subject: [PATCH 3/3] Expose preserve_case option from slugify

---
 docs/settings.rst              |  6 ++++++
 pelican/contents.py            |  3 ++-
 pelican/settings.py            |  1 +
 pelican/tests/test_contents.py | 35 ++++++++++++++++++++++++----------
 pelican/urlwrappers.py         |  9 ++++++++-
 5 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/docs/settings.rst b/docs/settings.rst
index 60e539d3..8c021563 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -628,6 +628,12 @@ corresponding ``*_URL`` setting as string, while others hard-code them:
    in auto-generated slugs. Otherwise, unicode characters will be replaced
    with ASCII equivalents.
 
+
+.. data:: SLUGIFY_PRESERVE_CASE = False
+
+   Preserve uppercase characters in the slugs. Set ``True`` to keep the
+   uppercase characters in the ``SLUGIFY_SOURCE`` as is.
+
 .. data:: SLUG_REGEX_SUBSTITUTIONS = [
         (r'[^\\w\\s-]', ''),  # remove non-alphabetical/whitespace/'-' chars
         (r'(?u)\\A\\s*', ''),  # strip leading whitespace
diff --git a/pelican/contents.py b/pelican/contents.py
index b49e1f2e..d01b241f 100644
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -102,7 +102,8 @@ class Content(object):
                 self.slug = slugify(
                     value,
                     regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []),
-                    use_unicode=settings['SLUGIFY_USE_UNICODE'])
+                    preserve_case=settings.get('SLUGIFY_PRESERVE_CASE', False),
+                    use_unicode=settings.get('SLUGIFY_USE_UNICODE', False))
 
         self.source_path = source_path
         self.relative_source_path = self.get_relative_source_path()
diff --git a/pelican/settings.py b/pelican/settings.py
index 0e0397c9..85cacbf0 100644
--- a/pelican/settings.py
+++ b/pelican/settings.py
@@ -156,6 +156,7 @@ DEFAULT_CONFIG = {
     'INTRASITE_LINK_REGEX': '[{|](?P<what>.*?)[|}]',
     'SLUGIFY_SOURCE': 'title',
     'SLUGIFY_USE_UNICODE': False,
+    'SLUGIFY_PRESERVE_CASE': False,
     'CACHE_CONTENT': False,
     'CONTENT_CACHING_LAYER': 'reader',
     'CACHE_PATH': 'cache',
diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py
index 2a377e75..ebde9c3c 100644
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@@ -135,16 +135,31 @@ class TestPage(LoggedTestCase):
         page = Page(**page_kwargs)
         self.assertEqual(page.slug, 'foo')
 
-        # test slug from unicode title
-        # slug doesn't use unicode
-        settings['SLUGIFY_SOURCE'] = "title"
-        page_kwargs['metadata']['title'] = '指導書'
-        page = Page(**page_kwargs)
-        self.assertEqual(page.slug, 'zhi-dao-shu')
-        # slug uses unicode
-        settings['SLUGIFY_USE_UNICODE'] = True
-        page = Page(**page_kwargs)
-        self.assertEqual(page.slug, '指導書')
+        # test slug from title with unicode and case
+
+        inputs = (
+            # (title, expected, preserve_case, use_unicode)
+            ('指導書', 'zhi-dao-shu', False, False),
+            ('指導書', 'Zhi-Dao-Shu', True, False),
+            ('指導書', '指導書', False, True),
+            ('指導書', '指導書', True, True),
+            ('Çığ', 'cig', False, False),
+            ('Çığ', 'Cig', True, False),
+            ('Çığ', 'çığ', False, True),
+            ('Çığ', 'Çığ', True, True),
+        )
+
+        settings = get_settings()
+        page_kwargs = self._copy_page_kwargs()
+        page_kwargs['settings'] = settings
+
+        for title, expected, preserve_case, use_unicode in inputs:
+            settings['SLUGIFY_PRESERVE_CASE'] = preserve_case
+            settings['SLUGIFY_USE_UNICODE'] = use_unicode
+            page_kwargs['metadata']['title'] = title
+            page = Page(**page_kwargs)
+            self.assertEqual(page.slug, expected,
+                             (title, preserve_case, use_unicode))
 
     def test_defaultlang(self):
         # If no lang is given, default to the default one.
diff --git a/pelican/urlwrappers.py b/pelican/urlwrappers.py
index 86f9ef32..d01611ba 100644
--- a/pelican/urlwrappers.py
+++ b/pelican/urlwrappers.py
@@ -37,9 +37,11 @@ class URLWrapper(object):
             regex_subs = self.settings.get(
                 class_key,
                 self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
+            preserve_case = self.settings.get('SLUGIFY_PRESERVE_CASE', False)
             self._slug = slugify(
                 self.name,
                 regex_subs=regex_subs,
+                preserve_case=preserve_case,
                 use_unicode=self.settings.get('SLUGIFY_USE_UNICODE', False)
             )
         return self._slug
@@ -66,7 +68,12 @@ class URLWrapper(object):
             class_key,
             self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
         use_unicode = self.settings.get('SLUGIFY_USE_UNICODE', False)
-        return slugify(key, regex_subs=regex_subs, use_unicode=use_unicode)
+        preserve_case = self.settings.get('SLUGIFY_PRESERVE_CASE', False)
+        return slugify(
+            key,
+            regex_subs=regex_subs,
+            preserve_case=preserve_case,
+            use_unicode=use_unicode)
 
     def __eq__(self, other):
         if isinstance(other, self.__class__):