From 648165b839fe9562822fa843cbdbfe224b12dfbd Mon Sep 17 00:00:00 2001
From: "Mr. Senko" <atodorov@mrsenko.com>
Date: Mon, 14 Mar 2016 00:16:58 +0200
Subject: [PATCH] More granular control of tags and categories slugs. Fixes
 #1873

- add TAG_SUBSTITUTIONS and CATEGORY_SUBSTITUTIONS settings
- make slugify keep non-alphanumeric characters if configured
---
 docs/changelog.rst                |  9 ++++++++-
 docs/settings.rst                 | 22 ++++++++++++++++++++--
 pelican/contents.py               |  1 +
 pelican/tests/test_contents.py    | 25 ++++++++++++++++++++++++-
 pelican/tests/test_urlwrappers.py | 15 +++++++++++++++
 pelican/tests/test_utils.py       | 12 ++++++++++++
 pelican/urlwrappers.py            | 17 ++++++++++++++++-
 pelican/utils.py                  | 30 +++++++++++++++++++++++++++---
 8 files changed, 123 insertions(+), 8 deletions(-)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index f52d6449..315a6b2b 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -4,7 +4,14 @@ Release history
 Next release
 ============
 
-- Nothing yet
+* ``SLUG_SUBSTITUTIONS`` now accepts 3-tuple elements, making it possible to
+  keep non-alphanumeric characters. Existing 2-tuple configurations continue
+  to work without change in behavior. The new 3rd parameter has side effects
+  when multiple substitutions are defined. Please see the docs.
+* Tag and category slugs can be controlled with greater precision using the
+  ``TAG_SUBSTITUTIONS`` and ``CATEGORY_SUBSTITUTIONS`` settings. These also
+  allow for keeping non-alphanum characters for backward compatibility with
+  existing URLs.
 
 3.6.3 (2015-08-14)
 ==================
diff --git a/docs/settings.rst b/docs/settings.rst
index 0adda992..3a8511bc 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -306,8 +306,12 @@ Setting name (followed by default value, if any)        What does it do?
 ``DAY_ARCHIVE_SAVE_AS = ''``                            The location to save per-day archives of your posts.
 ``SLUG_SUBSTITUTIONS = ()``                             Substitutions to make prior to stripping out
                                                         non-alphanumerics when generating slugs. Specified
-                                                        as a list of 2-tuples of ``(from, to)`` which are
-                                                        applied in order.
+                                                        as a list of 3-tuples of ``(from, to, skip)`` which are
+                                                        applied in order. ``skip`` is a boolean indicating whether
+                                                        or not to skip replacement of non-alphanumeric characters.
+                                                        Useful for backward compatibility with existing URLs.
+``CATEGORY_SUBSTITUTIONS = ()``                         Added to ``SLUG_SUBSTITUTIONS`` for categories.
+``TAG_SUBSTITUTIONS = ()``                              Added to ``SLUG_SUBSTITUTIONS`` for tags.
 ======================================================  ==============================================================
 
 .. note::
@@ -317,6 +321,20 @@ Setting name (followed by default value, if any)        What does it do?
     set the corresponding ``*_SAVE_AS`` setting to ``''`` to prevent the
     relevant page from being generated.
 
+.. note::
+
+    Substitutions are applied in order, with the side effect that keeping
+    non-alphanumeric characters applies to the whole string whenever a
+    substitution marked with ``skip`` is actually applied. For example, with
+    ``SLUG_SUBSTITUTIONS = (('C++', 'cpp'), ('keep dot', 'keep.dot', True))``
+    the string ``Keep Dot`` will be converted to ``keep.dot``; however,
+    ``C++ will keep dot`` will be converted to ``cpp will keep.dot`` instead
+    of ``cpp-will-keep.dot``!
+
+    If you want to keep non-alphanumeric characters only for tags or
+    categories but not for other slugs, configure ``TAG_SUBSTITUTIONS`` and
+    ``CATEGORY_SUBSTITUTIONS`` respectively!
+
 Pelican can optionally create per-year, per-month, and per-day archives of your
 posts. These secondary archives are disabled by default but are automatically
 enabled if you supply format strings for their respective ``_SAVE_AS`` settings.
diff --git a/pelican/contents.py b/pelican/contents.py
index 0123384a..9b6aa971 100644
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -172,6 +172,7 @@ class Content(object):
             'lang': getattr(self, 'lang', 'en'),
             'date': getattr(self, 'date', SafeDatetime.now()),
             'author': self.author.slug if hasattr(self, 'author') else '',
+            'tag': self.tag.slug if hasattr(self, 'tag') else '',
             'category': self.category.slug if hasattr(self, 'category') else ''
         })
         return metadata
diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py
index 6f0f6dd9..d62d0ed6 100644
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@@ -11,7 +11,7 @@ from jinja2.utils import generate_lorem_ipsum
 
 import six
 
-from pelican.contents import Article, Author, Category, Page, Static
+from pelican.contents import Article, Author, Category, Page, Static, Tag
 from pelican.settings import DEFAULT_CONFIG
 from pelican.signals import content_object_init
 from pelican.tests.support import LoggedTestCase, get_settings, unittest
@@ -457,6 +457,29 @@ class TestArticle(TestPage):
         self.assertEqual(
             article.save_as, 'obrien/csharp-stuff/fnord/index.html')
 
+    def test_slugify_category_with_dots(self):
+        settings = get_settings()
+        settings['CATEGORY_SUBSTITUTIONS'] = [('Fedora QA', 'fedora.qa', True)]
+        settings['ARTICLE_URL'] = '{category}/{slug}/'
+        article_kwargs = self._copy_page_kwargs()
+        article_kwargs['metadata']['category'] = Category('Fedora QA',
+                                                          settings)
+        article_kwargs['metadata']['title'] = 'This Week in Fedora QA'
+        article_kwargs['settings'] = settings
+        article = Article(**article_kwargs)
+        self.assertEqual(article.url, 'fedora.qa/this-week-in-fedora-qa/')
+
+    def test_slugify_tags_with_dots(self):
+        settings = get_settings()
+        settings['TAG_SUBSTITUTIONS'] = [('Fedora QA', 'fedora.qa', True)]
+        settings['ARTICLE_URL'] = '{tag}/{slug}/'
+        article_kwargs = self._copy_page_kwargs()
+        article_kwargs['metadata']['tag'] = Tag('Fedora QA', settings)
+        article_kwargs['metadata']['title'] = 'This Week in Fedora QA'
+        article_kwargs['settings'] = settings
+        article = Article(**article_kwargs)
+        self.assertEqual(article.url, 'fedora.qa/this-week-in-fedora-qa/')
+
 
 class TestStatic(LoggedTestCase):
 
diff --git a/pelican/tests/test_urlwrappers.py b/pelican/tests/test_urlwrappers.py
index f3dc8198..db194776 100644
--- a/pelican/tests/test_urlwrappers.py
+++ b/pelican/tests/test_urlwrappers.py
@@ -56,3 +56,18 @@ class TestURLWrapper(unittest.TestCase):
 
         cat_ascii = Category('指導書', settings={})
         self.assertEqual(cat_ascii, u'zhi-dao-shu')
+
+    def test_slugify_with_substitutions_and_dots(self):
+        tag = Tag('Tag Dot',
+                  settings={
+                        'TAG_SUBSTITUTIONS': [('Tag Dot', 'tag.dot', True)]
+                    })
+        cat = Category('Category Dot',
+                       settings={
+                        'CATEGORY_SUBSTITUTIONS': (('Category Dot',
+                                                    'cat.dot',
+                                                    True),)
+                        })
+
+        self.assertEqual(tag.slug, 'tag.dot')
+        self.assertEqual(cat.slug, 'cat.dot')
diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py
index fed7e94b..8dfc0b9b 100644
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -131,6 +131,18 @@ class TestUtils(LoggedTestCase):
         for value, expected in samples:
             self.assertEqual(utils.slugify(value, subs), expected)
 
+    def test_slugify_substitute_and_keeping_non_alphanum(self):
+
+        samples = (('Fedora QA', 'fedora.qa'),
+                   ('C++ is used by Fedora QA', 'cpp is used by fedora.qa'),
+                   ('C++ is based on C', 'cpp-is-based-on-c'),
+                   ('C+++ test C+ test', 'cpp-test-c-test'),)
+
+        subs = (('Fedora QA', 'fedora.qa', True),
+                ('c++', 'cpp'),)
+        for value, expected in samples:
+            self.assertEqual(utils.slugify(value, subs), expected)
+
     def test_get_relative_path(self):
 
         samples = ((os.path.join('test', 'test.html'), os.pardir),
diff --git a/pelican/urlwrappers.py b/pelican/urlwrappers.py
index bf1199a8..e56fea8f 100644
--- a/pelican/urlwrappers.py
+++ b/pelican/urlwrappers.py
@@ -112,13 +112,28 @@ class URLWrapper(object):
 
 
 class Category(URLWrapper):
-    pass
+    @property
+    def slug(self):
+        if self._slug is None:
+            # tuple() copies: ``+=`` on a list-valued setting would mutate it
+            subs = tuple(self.settings.get('SLUG_SUBSTITUTIONS', ()))
+            subs += tuple(self.settings.get('CATEGORY_SUBSTITUTIONS', ()))
+            self._slug = slugify(self.name, subs)
+        return self._slug
 
 
 class Tag(URLWrapper):
     def __init__(self, name, *args, **kwargs):
         super(Tag, self).__init__(name.strip(), *args, **kwargs)
 
+    @property
+    def slug(self):
+        if self._slug is None:
+            subs = tuple(self.settings.get('SLUG_SUBSTITUTIONS', ()))  # copy
+            subs += tuple(self.settings.get('TAG_SUBSTITUTIONS', ()))
+            self._slug = slugify(self.name, subs)
+        return self._slug
+
 
 class Author(URLWrapper):
     pass
diff --git a/pelican/utils.py b/pelican/utils.py
index b5685a3b..4e729361 100644
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -270,10 +270,34 @@ def slugify(value, substitutions=()):
         value = value.decode('ascii')
     # still unicode
     value = unicodedata.normalize('NFKD', value).lower()
-    for src, dst in substitutions:
+
+    # backward-compatible conversion from 2-tuples to 3-tuples
+    new_subs = []
+    for tpl in substitutions:
+        try:
+            src, dst, skip = tpl
+        except ValueError:
+            src, dst = tpl
+            skip = False
+        new_subs.append((src, dst, skip))
+    substitutions = tuple(new_subs)
+
+    # by default will replace non-alphanum characters
+    replace = True
+    for src, dst, skip in substitutions:
+        orig_value = value
         value = value.replace(src.lower(), dst.lower())
-    value = re.sub('[^\w\s-]', '', value).strip()
-    value = re.sub('[-\s]+', '-', value)
+        # if replacement was made then skip non-alphanum
+        # replacement if instructed to do so
+        if value != orig_value:
+            replace = replace and not skip
+
+    if replace:
+        value = re.sub('[^\w\s-]', '', value).strip()
+        value = re.sub('[-\s]+', '-', value)
+    else:
+        value = value.strip()
+
     # we want only ASCII chars
     value = value.encode('ascii', 'ignore')
     # but Pelican should generally use only unicode