Allow text substitutions when generating slugs

The `slugify()` function used by Pelican is, in general, very good at coming up with something both readable and URL-safe. However, there are a few specific cases where it produces conflicting slugs. One that I've run into is using the strings `C++` and `C` as tags, both of which transform to the slug `c`. This commit adds an optional `SLUG_SUBSTITUTIONS` setting, a list of `(from, to)` 2-tuples of substitutions that are applied in order, case-insensitively, just prior to stripping out non-alphanumeric characters. This allows cases like `C++` to be transformed to `CPP` (yielding the slug `cpp`) or similar. It can also improve the readability of slugs.
2013-06-14 15:54:06 +01:00 · 2013-06-14 15:54:06 +01:00 · 39518e15ef
commit 39518e15ef
parent 7ec4d5faa2
6 changed files with 28 additions and 8 deletions
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -258,6 +258,10 @@ Setting name (default value)                            What does it do?
                                                        posts.
 `DAY_ARCHIVE_SAVE_AS` (False)                           The location to save per-day archives of your
                                                        posts.
+`SLUG_SUBSTITUTIONS`  (``()``)                          Substitutions to make prior to stripping out
+                                                        non-alphanumerics when generating slugs. Specified
+                                                        as a list of 2-tuples of ``(from, to)`` which are
+                                                        applied in order.
 ====================================================    =====================================================

 .. note::
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -86,7 +86,8 @@ class Content(object):

        # create the slug if not existing, from the title
        if not hasattr(self, 'slug') and hasattr(self, 'title'):
-            self.slug = slugify(self.title)
+            self.slug = slugify(self.title,
+                                settings.get('SLUG_SUBSTITUTIONS', ()))

        self.source_path = source_path

--- a/pelican/settings.py
+++ b/pelican/settings.py
@@ -105,6 +105,7 @@ DEFAULT_CONFIG = {
    'PLUGINS': [],
    'TEMPLATE_PAGES': {},
    'IGNORE_FILES': ['.#*'],
+    'SLUG_SUBSTITUTIONS': (),
    }

 def read_settings(path=None, override=None):
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@@ -94,6 +94,17 @@ class TestUtils(LoggedTestCase):
        for value, expected in samples:
            self.assertEqual(utils.slugify(value), expected)

+    def test_slugify_substitute(self):
+
+        samples = (('C++ is based on C', 'cpp-is-based-on-c'),
+                   ('C+++ test C+ test', 'cpp-test-c-test'),
+                   ('c++, c#, C#, C++', 'cpp-c-sharp-c-sharp-cpp'),
+                   ('c++-streams', 'cpp-streams'),)
+
+        subs = (('C++', 'CPP'), ('C#', 'C-SHARP'))
+        for value, expected in samples:
+            self.assertEqual(utils.slugify(value, subs), expected)
+
    def test_get_relative_path(self):

        samples = ((os.path.join('test', 'test.html'), os.pardir),
--- a/pelican/urlwrappers.py
+++ b/pelican/urlwrappers.py
@@ -15,10 +15,10 @@ class URLWrapper(object):
    def __init__(self, name, settings):
        # next 2 lines are redundant with the setter of the name property
        # but are here for clarity
-        self._name = name
-        self.slug = slugify(name)
-        self.name = name
        self.settings = settings
+        self._name = name
+        self.slug = slugify(name, self.settings.get('SLUG_SUBSTITUTIONS', ()))
+        self.name = name

    @property
    def name(self):
@@ -27,7 +27,7 @@ class URLWrapper(object):
    @name.setter
    def name(self, name):
        self._name = name
-        self.slug = slugify(name)
+        self.slug = slugify(name, self.settings.get('SLUG_SUBSTITUTIONS', ()))

    def as_dict(self):
        d = self.__dict__
@@ -41,7 +41,8 @@ class URLWrapper(object):
        return self.slug

    def _normalize_key(self, key):
-        return six.text_type(slugify(key))
+        subs = self.settings.get('SLUG_SUBSTITUTIONS', ())
+        return six.text_type(slugify(key, subs))

    def __eq__(self, other):
        return self._key() == self._normalize_key(other)
--- a/pelican/utils.py
+++ b/pelican/utils.py
@@ -231,7 +231,7 @@ class pelican_open(object):
        pass


-def slugify(value):
+def slugify(value, substitutions=()):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
@@ -249,8 +249,10 @@ def slugify(value):
    if isinstance(value, six.binary_type):
        value = value.decode('ascii')
    # still unicode
-    value = unicodedata.normalize('NFKD', value)
-    value = re.sub('[^\w\s-]', '', value).strip().lower()
+    value = unicodedata.normalize('NFKD', value).lower()
+    for src, dst in substitutions:
+        value = value.replace(src.lower(), dst.lower())
+    value = re.sub('[^\w\s-]', '', value).strip()
    value = re.sub('[-\s]+', '-', value)
    # we want only ASCII chars
    value = value.encode('ascii', 'ignore')