Merge pull request #2326 from oulenz/slug_substitutions

Control slug substitutions from settings with regex
2018-10-31 20:08:01 +01:00 · 2018-10-31 20:08:01 +01:00 · 461f535d04
commit 461f535d04
parent 96a689eaef 5199fa51ea
12 changed files with 409 additions and 235 deletions
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -6,6 +6,8 @@ Next release

 * New signal: ``feed_generated``
 * Replace Fabric by Invoke and ``fabfile.py`` template by ``tasks.py``.
+* Replace ``SLUG_SUBSTITUTIONS`` (and friends) by ``SLUG_REGEX_SUBSTITUTIONS``
+  for more fine-grained control

 3.7.1 (2017-01-10)
 ==================
--- a/docs/settings.rst
+++ b/docs/settings.rst
@ -519,27 +519,6 @@ respectively.
   The URL to use for per-day archives of your posts. Used only if you have the
   ``{url}`` placeholder in ``PAGINATION_PATTERNS``.

-.. data:: SLUG_SUBSTITUTIONS = ()
-
-   Substitutions to make prior to stripping out non-alphanumerics when
-   generating slugs. Specified as a list of 3-tuples of ``(from, to, skip)``
-   which are applied in order. ``skip`` is a boolean indicating whether or not
-   to skip replacement of non-alphanumeric characters.  Useful for backward
-   compatibility with existing URLs.
-
-.. data:: AUTHOR_SUBSTITUTIONS = ()
-
-   Substitutions for authors. ``SLUG_SUBSTITUTIONS`` is not taken into account
-   here!
-
-.. data:: CATEGORY_SUBSTITUTIONS = ()
-
-   Added to ``SLUG_SUBSTITUTIONS`` for categories.
-
-.. data:: TAG_SUBSTITUTIONS = ()
-
-   Added to ``SLUG_SUBSTITUTIONS`` for tags.
-
 .. note::

    If you do not want one or more of the default pages to be created (e.g.,
@ -547,24 +526,6 @@ respectively.
    set the corresponding ``*_SAVE_AS`` setting to ``''`` to prevent the
    relevant page from being generated.

-.. note::
-
-    Substitutions are applied in order with the side effect that keeping
-    non-alphanum characters applies to the whole string when a replacement
-    is made.
-
-    For example if you have the following setting::
-
-       SLUG_SUBSTITUTIONS = (('C++', 'cpp'), ('keep dot', 'keep.dot', True))
-
-    the string ``Keep Dot`` will be converted to ``keep.dot``, however
-    ``C++ will keep dot`` will be converted to ``cpp will keep.dot`` instead
-    of ``cpp-will-keep.dot``!
-
-    If you want to keep non-alphanum characters only for tags or categories
-    but not other slugs then configure ``TAG_SUBSTITUTIONS`` and
-    ``CATEGORY_SUBSTITUTIONS`` respectively!
-
 Pelican can optionally create per-year, per-month, and per-day archives of your
 posts. These secondary archives are disabled by default but are automatically
 enabled if you supply format strings for their respective ``_SAVE_AS`` settings.
@ -626,6 +587,33 @@ URLs for direct template pages are theme-dependent. Some themes use
 corresponding ``*_URL`` setting as string, while others hard-code them:
 ``'archives.html'``, ``'authors.html'``, ``'categories.html'``, ``'tags.html'``.

+.. data:: SLUG_REGEX_SUBSTITUTIONS = [
+        (r'[^\w\s-]', ''),  # remove all but word chars, whitespace and '-'
+        (r'(?u)\A\s*', ''),  # strip leading whitespace
+        (r'(?u)\s*\Z', ''),  # strip trailing whitespace
+        (r'[-\s]+', '-'),  # reduce multiple whitespace or '-' to single '-'
+    ]
+
+   Regex substitutions to make when generating slugs of articles and pages.
+   Specified as a list of pairs of ``(from, to)`` which are applied in order,
+   ignoring case. The default substitutions have the effect of removing
+   non-alphanumeric characters and converting internal whitespace to dashes.
+   Apart from these substitutions, slugs are always converted to lowercase
+   ASCII characters, and leading and trailing whitespace is stripped. This
+   setting is useful for backward compatibility with existing URLs.
+
+.. data:: AUTHOR_REGEX_SUBSTITUTIONS = SLUG_REGEX_SUBSTITUTIONS
+
+   Regex substitutions for author slugs. Defaults to ``SLUG_REGEX_SUBSTITUTIONS``.
+
+.. data:: CATEGORY_REGEX_SUBSTITUTIONS = SLUG_REGEX_SUBSTITUTIONS
+
+   Regex substitutions for category slugs. Defaults to ``SLUG_REGEX_SUBSTITUTIONS``.
+
+.. data:: TAG_REGEX_SUBSTITUTIONS = SLUG_REGEX_SUBSTITUTIONS
+
+   Regex substitutions for tag slugs. Defaults to ``SLUG_REGEX_SUBSTITUTIONS``.
+
 Time and Date
 =============

--- a/pelican/contents.py
+++ b/pelican/contents.py
@ -98,14 +98,16 @@ class Content(object):
        if not hasattr(self, 'slug'):
            if (settings['SLUGIFY_SOURCE'] == 'title' and
                    hasattr(self, 'title')):
-                self.slug = slugify(self.title,
-                                    settings.get('SLUG_SUBSTITUTIONS', ()))
+                self.slug = slugify(
+                    self.title,
+                    regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
            elif (settings['SLUGIFY_SOURCE'] == 'basename' and
                    source_path is not None):
                basename = os.path.basename(
                    os.path.splitext(source_path)[0])
                self.slug = slugify(
-                    basename, settings.get('SLUG_SUBSTITUTIONS', ()))
+                    basename,
+                    regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []))

        self.source_path = source_path

--- a/pelican/settings.py
+++ b/pelican/settings.py
@ -6,6 +6,7 @@ import inspect
 import locale
 import logging
 import os
+import re
 from os.path import isabs
 from posixpath import join as posix_join

@ -145,7 +146,12 @@ DEFAULT_CONFIG = {
    'TEMPLATE_PAGES': {},
    'TEMPLATE_EXTENSIONS': ['.html'],
    'IGNORE_FILES': ['.#*'],
-    'SLUG_SUBSTITUTIONS': (),
+    'SLUG_REGEX_SUBSTITUTIONS': [
+        (r'[^\w\s-]', ''),  # remove all but word chars, whitespace and '-'
+        (r'(?u)\A\s*', ''),  # strip leading whitespace
+        (r'(?u)\s*\Z', ''),  # strip trailing whitespace
+        (r'[-\s]+', '-'),  # reduce multiple whitespace or '-' to single '-'
+    ],
    'INTRASITE_LINK_REGEX': '[{|](?P<what>.*?)[|}]',
    'SLUGIFY_SOURCE': 'title',
    'CACHE_CONTENT': False,
@ -164,79 +170,62 @@ PYGMENTS_RST_OPTIONS = None


 def read_settings(path=None, override=None):
+    settings = override or {}
+
    if path:
-        local_settings = get_settings_from_file(path)
-        # Make the paths relative to the settings file
+        settings = dict(get_settings_from_file(path), **settings)
+
+    if settings:
+        settings = handle_deprecated_settings(settings)
+
+    if path:
+        # Make relative paths absolute
+        def getabs(maybe_relative, base_path=path):
+            if isabs(maybe_relative):
+                return maybe_relative
+            return os.path.abspath(os.path.normpath(os.path.join(
+                os.path.dirname(base_path), maybe_relative)))
+
        for p in ['PATH', 'OUTPUT_PATH', 'THEME', 'CACHE_PATH']:
-            if p in local_settings and local_settings[p] is not None \
-                    and not isabs(local_settings[p]):
-                absp = os.path.abspath(os.path.normpath(os.path.join(
-                    os.path.dirname(path), local_settings[p])))
+            if settings.get(p) is not None:
+                absp = getabs(settings[p])
+                # THEME may be a name rather than a path
                if p != 'THEME' or os.path.exists(absp):
-                    local_settings[p] = absp
+                    settings[p] = absp

-        if 'PLUGIN_PATH' in local_settings:
-            logger.warning('PLUGIN_PATH setting has been replaced by '
-                           'PLUGIN_PATHS, moving it to the new setting name.')
-            local_settings['PLUGIN_PATHS'] = local_settings['PLUGIN_PATH']
-            del local_settings['PLUGIN_PATH']
-        if 'JINJA_EXTENSIONS' in local_settings:
-            logger.warning('JINJA_EXTENSIONS setting has been deprecated, '
-                           'moving it to JINJA_ENVIRONMENT setting.')
-            local_settings['JINJA_ENVIRONMENT']['extensions'] = \
-                local_settings['JINJA_EXTENSIONS']
-            del local_settings['JINJA_EXTENSIONS']
-        if isinstance(local_settings['PLUGIN_PATHS'], six.string_types):
-            logger.warning("Defining PLUGIN_PATHS setting as string "
-                           "has been deprecated (should be a list)")
-            local_settings['PLUGIN_PATHS'] = [local_settings['PLUGIN_PATHS']]
-        elif local_settings['PLUGIN_PATHS'] is not None:
-            def getabs(path, pluginpath):
-                if isabs(pluginpath):
-                    return pluginpath
-                else:
-                    path_dirname = os.path.dirname(path)
-                    path_joined = os.path.join(path_dirname, pluginpath)
-                    path_normed = os.path.normpath(path_joined)
-                    path_absolute = os.path.abspath(path_normed)
-                    return path_absolute
+        if settings.get('PLUGIN_PATHS') is not None:
+            settings['PLUGIN_PATHS'] = [getabs(pluginpath)
+                                        for pluginpath
+                                        in settings['PLUGIN_PATHS']]

-            pluginpath_list = [getabs(path, pluginpath)
-                               for pluginpath
-                               in local_settings['PLUGIN_PATHS']]
-            local_settings['PLUGIN_PATHS'] = pluginpath_list
-    else:
-        local_settings = copy.deepcopy(DEFAULT_CONFIG)
+    settings = dict(copy.deepcopy(DEFAULT_CONFIG), **settings)
+    settings = configure_settings(settings)

-    if override:
-        local_settings.update(override)
-
-    parsed_settings = configure_settings(local_settings)
    # This is because there doesn't seem to be a way to pass extra
    # parameters to docutils directive handlers, so we have to have a
    # variable here that we'll import from within Pygments.run (see
    # rstdirectives.py) to see what the user defaults were.
    global PYGMENTS_RST_OPTIONS
-    PYGMENTS_RST_OPTIONS = parsed_settings.get('PYGMENTS_RST_OPTIONS', None)
-    return parsed_settings
+    PYGMENTS_RST_OPTIONS = settings.get('PYGMENTS_RST_OPTIONS', None)
+    return settings


-def get_settings_from_module(module=None, default_settings=DEFAULT_CONFIG):
+def get_settings_from_module(module=None):
    """Loads settings from a module, returns a dictionary."""

-    context = copy.deepcopy(default_settings)
+    context = {}
    if module is not None:
        context.update(
            (k, v) for k, v in inspect.getmembers(module) if k.isupper())
    return context


-def get_settings_from_file(path, default_settings=DEFAULT_CONFIG):
+def get_settings_from_file(path):
    """Loads settings from a file path, returning a dict."""

    name, ext = os.path.splitext(os.path.basename(path))
    module = load_source(name, path)
-    return get_settings_from_module(module, default_settings=default_settings)
+    return get_settings_from_module(module)


 def get_jinja_environment(settings):
@ -253,6 +242,149 @@ def get_jinja_environment(settings):
    return settings


+def handle_deprecated_settings(settings):
+    """Converts deprecated settings and issues warnings. Issues an exception
+    if both old and new setting is specified.
+    """
+
+    # PLUGIN_PATH -> PLUGIN_PATHS
+    if 'PLUGIN_PATH' in settings:
+        logger.warning('PLUGIN_PATH setting has been replaced by '
+                       'PLUGIN_PATHS, moving it to the new setting name.')
+        settings['PLUGIN_PATHS'] = settings['PLUGIN_PATH']
+        del settings['PLUGIN_PATH']
+
+    # PLUGIN_PATHS: str -> [str]
+    if isinstance(settings.get('PLUGIN_PATHS'), six.string_types):
+        logger.warning("Defining PLUGIN_PATHS setting as string "
+                       "has been deprecated (should be a list)")
+        settings['PLUGIN_PATHS'] = [settings['PLUGIN_PATHS']]
+
+    # JINJA_EXTENSIONS -> JINJA_ENVIRONMENT > extensions
+    if 'JINJA_EXTENSIONS' in settings:
+        logger.warning('JINJA_EXTENSIONS setting has been deprecated, '
+                       'moving it to JINJA_ENVIRONMENT setting.')
+        settings['JINJA_ENVIRONMENT']['extensions'] = \
+            settings['JINJA_EXTENSIONS']
+        del settings['JINJA_EXTENSIONS']
+
+    # {ARTICLE,PAGE}_DIR -> {ARTICLE,PAGE}_PATHS
+    for key in ['ARTICLE', 'PAGE']:
+        old_key = key + '_DIR'
+        new_key = key + '_PATHS'
+        if old_key in settings:
+            logger.warning(
+                'Deprecated setting %s, moving it to %s list',
+                old_key, new_key)
+            settings[new_key] = [settings[old_key]]   # also make a list
+            del settings[old_key]
+
+    # EXTRA_TEMPLATES_PATHS -> THEME_TEMPLATES_OVERRIDES
+    if 'EXTRA_TEMPLATES_PATHS' in settings:
+        logger.warning('EXTRA_TEMPLATES_PATHS is deprecated use '
+                       'THEME_TEMPLATES_OVERRIDES instead.')
+        if ('THEME_TEMPLATES_OVERRIDES' in settings and
+                settings['THEME_TEMPLATES_OVERRIDES']):
+            raise Exception(
+                'Setting both EXTRA_TEMPLATES_PATHS and '
+                'THEME_TEMPLATES_OVERRIDES is not permitted. Please move to '
+                'only setting THEME_TEMPLATES_OVERRIDES.')
+        settings['THEME_TEMPLATES_OVERRIDES'] = \
+            settings['EXTRA_TEMPLATES_PATHS']
+        del settings['EXTRA_TEMPLATES_PATHS']
+
+    # MD_EXTENSIONS -> MARKDOWN
+    if 'MD_EXTENSIONS' in settings:
+        logger.warning('MD_EXTENSIONS is deprecated use MARKDOWN '
+                       'instead. Falling back to the default.')
+        settings['MARKDOWN'] = DEFAULT_CONFIG['MARKDOWN']
+
+    # LESS_GENERATOR -> Webassets plugin
+    # FILES_TO_COPY -> STATIC_PATHS, EXTRA_PATH_METADATA
+    for old, new, doc in [
+            ('LESS_GENERATOR', 'the Webassets plugin', None),
+            ('FILES_TO_COPY', 'STATIC_PATHS and EXTRA_PATH_METADATA',
+                'https://github.com/getpelican/pelican/'
+                'blob/master/docs/settings.rst#path-metadata'),
+    ]:
+        if old in settings:
+            message = 'The {} setting has been removed in favor of {}'.format(
+                old, new)
+            if doc:
+                message += ', see {} for details'.format(doc)
+            logger.warning(message)
+
+    # PAGINATED_DIRECT_TEMPLATES -> PAGINATED_TEMPLATES
+    if 'PAGINATED_DIRECT_TEMPLATES' in settings:
+        message = 'The {} setting has been removed in favor of {}'.format(
+            'PAGINATED_DIRECT_TEMPLATES', 'PAGINATED_TEMPLATES')
+        logger.warning(message)
+
+        for t in settings['PAGINATED_DIRECT_TEMPLATES']:
+            if t not in settings['PAGINATED_TEMPLATES']:
+                settings['PAGINATED_TEMPLATES'][t] = None
+        del settings['PAGINATED_DIRECT_TEMPLATES']
+
+    # {SLUG,CATEGORY,TAG,AUTHOR}_SUBSTITUTIONS ->
+    # {SLUG,CATEGORY,TAG,AUTHOR}_REGEX_SUBSTITUTIONS
+    url_settings_url = \
+        'http://docs.getpelican.com/en/latest/settings.html#url-settings'
+    flavours = {'SLUG', 'CATEGORY', 'TAG', 'AUTHOR'}
+    old_values = {f: settings[f + '_SUBSTITUTIONS']
+                  for f in flavours if f + '_SUBSTITUTIONS' in settings}
+    new_values = {f: settings[f + '_REGEX_SUBSTITUTIONS']
+                  for f in flavours if f + '_REGEX_SUBSTITUTIONS' in settings}
+    if old_values and new_values:
+        raise Exception(
+            'Setting both {new_key} and {old_key} (or variants thereof) is '
+            'not permitted. Please move to only setting {new_key}.'
+            .format(old_key='SLUG_SUBSTITUTIONS',
+                    new_key='SLUG_REGEX_SUBSTITUTIONS'))
+    if old_values:
+        message = ('{} and variants thereof are deprecated and will be '
+                   'removed in the future. Please use {} and variants thereof '
+                   'instead. Check {}.'
+                   .format('SLUG_SUBSTITUTIONS', 'SLUG_REGEX_SUBSTITUTIONS',
+                           url_settings_url))
+        logger.warning(message)
+        if old_values.get('SLUG'):
+            for f in {'CATEGORY', 'TAG'}:
+                if old_values.get(f):
+                    old_values[f] = old_values['SLUG'] + old_values[f]
+            old_values['AUTHOR'] = old_values.get('AUTHOR', [])
+        for f in flavours:
+            if old_values.get(f) is not None:
+                regex_subs = []
+                # by default will replace non-alphanum characters
+                replace = True
+                for tpl in old_values[f]:
+                    try:
+                        src, dst, skip = tpl
+                        if skip:
+                            replace = False
+                    except ValueError:
+                        src, dst = tpl
+                    regex_subs.append(
+                        (re.escape(src), dst.replace('\\', r'\\')))
+
+                if replace:
+                    regex_subs += [
+                        (r'[^\w\s-]', ''),
+                        (r'(?u)\A\s*', ''),
+                        (r'(?u)\s*\Z', ''),
+                        (r'[-\s]+', '-'),
+                    ]
+                else:
+                    regex_subs += [
+                        (r'(?u)\A\s*', ''),
+                        (r'(?u)\s*\Z', ''),
+                    ]
+                settings[f + '_REGEX_SUBSTITUTIONS'] = regex_subs
+            settings.pop(f + '_SUBSTITUTIONS', None)
+
+    return settings
+
+
 def configure_settings(settings):
    """Provide optimizations, error checking, and warnings for the given
    settings.
@ -377,31 +509,6 @@ def configure_settings(settings):
        key=lambda r: r[0],
    )

-    # move {ARTICLE,PAGE}_DIR -> {ARTICLE,PAGE}_PATHS
-    for key in ['ARTICLE', 'PAGE']:
-        old_key = key + '_DIR'
-        new_key = key + '_PATHS'
-        if old_key in settings:
-            logger.warning(
-                'Deprecated setting %s, moving it to %s list',
-                old_key, new_key)
-            settings[new_key] = [settings[old_key]]   # also make a list
-            del settings[old_key]
-
-    # Deprecated warning of EXTRA_TEMPLATES_PATHS
-    if 'EXTRA_TEMPLATES_PATHS' in settings:
-        logger.warning('EXTRA_TEMPLATES_PATHS is deprecated use '
-                       'THEME_TEMPLATES_OVERRIDES instead.')
-        if ('THEME_TEMPLATES_OVERRIDES' in settings and
-                settings['THEME_TEMPLATES_OVERRIDES']):
-            raise Exception(
-                'Setting both EXTRA_TEMPLATES_PATHS and '
-                'THEME_TEMPLATES_OVERRIDES is not permitted. Please move to '
-                'only setting THEME_TEMPLATES_OVERRIDES.')
-        settings['THEME_TEMPLATES_OVERRIDES'] = \
-            settings['EXTRA_TEMPLATES_PATHS']
-        del settings['EXTRA_TEMPLATES_PATHS']
-
    # Save people from accidentally setting a string rather than a list
    path_keys = (
        'ARTICLE_EXCLUDES',
@ -425,12 +532,6 @@ def configure_settings(settings):
                           PATH_KEY)
            settings[PATH_KEY] = DEFAULT_CONFIG[PATH_KEY]

-    # Deprecated warning of MD_EXTENSIONS
-    if 'MD_EXTENSIONS' in settings:
-        logger.warning('MD_EXTENSIONS is deprecated use MARKDOWN '
-                       'instead. Falling back to the default.')
-        settings['MARKDOWN'] = DEFAULT_CONFIG['MARKDOWN']
-
    # Add {PAGE,ARTICLE}_PATHS to {ARTICLE,PAGE}_EXCLUDES
    mutually_exclusive = ('ARTICLE', 'PAGE')
    for type_1, type_2 in [mutually_exclusive, mutually_exclusive[::-1]]:
@ -443,27 +544,4 @@ def configure_settings(settings):
        except KeyError:
            continue            # setting not specified, nothing to do

-    for old, new, doc in [
-            ('LESS_GENERATOR', 'the Webassets plugin', None),
-            ('FILES_TO_COPY', 'STATIC_PATHS and EXTRA_PATH_METADATA',
-                'https://github.com/getpelican/pelican/'
-                'blob/master/docs/settings.rst#path-metadata'),
-    ]:
-        if old in settings:
-            message = 'The {} setting has been removed in favor of {}'.format(
-                old, new)
-            if doc:
-                message += ', see {} for details'.format(doc)
-            logger.warning(message)
-
-    if 'PAGINATED_DIRECT_TEMPLATES' in settings:
-        message = 'The {} setting has been removed in favor of {}'.format(
-            'PAGINATED_DIRECT_TEMPLATES', 'PAGINATED_TEMPLATES')
-        logger.warning(message)
-
-        for t in settings['PAGINATED_DIRECT_TEMPLATES']:
-            if t not in settings['PAGINATED_TEMPLATES']:
-                settings['PAGINATED_TEMPLATES'][t] = None
-        del settings['PAGINATED_DIRECT_TEMPLATES']
-
    return settings
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@ -497,7 +497,13 @@ class TestArticle(TestPage):

    def test_slugify_category_author(self):
        settings = get_settings()
-        settings['SLUG_SUBSTITUTIONS'] = [('C#', 'csharp')]
+        settings['SLUG_REGEX_SUBSTITUTIONS'] = [
+            (r'C#', 'csharp'),
+            (r'[^\w\s-]', ''),
+            (r'(?u)\A\s*', ''),
+            (r'(?u)\s*\Z', ''),
+            (r'[-\s]+', '-'),
+        ]
        settings['ARTICLE_URL'] = '{author}/{category}/{slug}/'
        settings['ARTICLE_SAVE_AS'] = '{author}/{category}/{slug}/index.html'
        article_kwargs = self._copy_page_kwargs()
@ -513,9 +519,13 @@ class TestArticle(TestPage):

    def test_slugify_with_author_substitutions(self):
        settings = get_settings()
-        settings['AUTHOR_SUBSTITUTIONS'] = [
-                                    ('Alexander Todorov', 'atodorov', False),
-                                    ('Krasimir Tsonev', 'krasimir', False),
+        settings['AUTHOR_REGEX_SUBSTITUTIONS'] = [
+            ('Alexander Todorov', 'atodorov'),
+            ('Krasimir Tsonev', 'krasimir'),
+            (r'[^\w\s-]', ''),
+            (r'(?u)\A\s*', ''),
+            (r'(?u)\s*\Z', ''),
+            (r'[-\s]+', '-'),
        ]
        settings['ARTICLE_URL'] = 'blog/{author}/{slug}/'
        settings['ARTICLE_SAVE_AS'] = 'blog/{author}/{slug}/index.html'
@ -530,7 +540,9 @@ class TestArticle(TestPage):

    def test_slugify_category_with_dots(self):
        settings = get_settings()
-        settings['CATEGORY_SUBSTITUTIONS'] = [('Fedora QA', 'fedora.qa', True)]
+        settings['CATEGORY_REGEX_SUBSTITUTIONS'] = [
+            ('Fedora QA', 'fedora.qa'),
+        ]
        settings['ARTICLE_URL'] = '{category}/{slug}/'
        article_kwargs = self._copy_page_kwargs()
        article_kwargs['metadata']['category'] = Category('Fedora QA',
@ -542,7 +554,9 @@ class TestArticle(TestPage):

    def test_slugify_tags_with_dots(self):
        settings = get_settings()
-        settings['TAG_SUBSTITUTIONS'] = [('Fedora QA', 'fedora.qa', True)]
+        settings['TAG_REGEX_SUBSTITUTIONS'] = [
+            ('Fedora QA', 'fedora.qa'),
+        ]
        settings['ARTICLE_URL'] = '{tag}/{slug}/'
        article_kwargs = self._copy_page_kwargs()
        article_kwargs['metadata']['tag'] = Tag('Fedora QA', settings)
--- a/pelican/tests/test_importer.py
+++ b/pelican/tests/test_importer.py
@ -6,6 +6,7 @@ import os
 import re
 from codecs import open

+from pelican.settings import DEFAULT_CONFIG
 from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder,
                                   unittest)
 from pelican.tools.pelican_import import (blogger2fields, build_header,
@ -133,10 +134,11 @@ class TestWordpressXmlImporter(unittest.TestCase):
        with temporary_folder() as temp:
            fnames = list(silent_f2p(test_posts, 'markdown',
                                     temp, dircat=True))
+        subs = DEFAULT_CONFIG['SLUG_REGEX_SUBSTITUTIONS']
        index = 0
        for post in test_posts:
            name = post[2]
-            category = slugify(post[5][0])
+            category = slugify(post[5][0], regex_subs=subs)
            name += '.md'
            filename = os.path.join(category, name)
            out_name = fnames[index]
@ -208,11 +210,12 @@ class TestWordpressXmlImporter(unittest.TestCase):
        with temporary_folder() as temp:
            fnames = list(silent_f2p(test_posts, 'markdown', temp,
                                     wp_custpost=True, dircat=True))
+        subs = DEFAULT_CONFIG['SLUG_REGEX_SUBSTITUTIONS']
        index = 0
        for post in test_posts:
            name = post[2]
            kind = post[8]
-            category = slugify(post[5][0])
+            category = slugify(post[5][0], regex_subs=subs)
            name += '.md'
            filename = os.path.join(kind, category, name)
            out_name = fnames[index]
--- a/pelican/tests/test_settings.py
+++ b/pelican/tests/test_settings.py
@ -9,7 +9,8 @@ from sys import platform


 from pelican.settings import (DEFAULT_CONFIG, DEFAULT_THEME,
-                              configure_settings, read_settings)
+                              configure_settings, handle_deprecated_settings,
+                              read_settings)
 from pelican.tests.support import unittest


@ -128,7 +129,7 @@ class TestSettingsConfiguration(unittest.TestCase):
        settings['ARTICLE_DIR'] = 'foo'
        settings['PAGE_DIR'] = 'bar'

-        configure_settings(settings)
+        settings = handle_deprecated_settings(settings)

        self.assertEqual(settings['ARTICLE_PATHS'], ['foo'])
        self.assertEqual(settings['PAGE_PATHS'], ['bar'])
@ -171,7 +172,7 @@ class TestSettingsConfiguration(unittest.TestCase):
        settings = self.settings
        settings['EXTRA_TEMPLATES_PATHS'] = ['/foo/bar', '/ha']

-        configure_settings(settings)
+        settings = handle_deprecated_settings(settings)

        self.assertEqual(settings['THEME_TEMPLATES_OVERRIDES'],
                         ['/foo/bar', '/ha'])
@ -181,7 +182,7 @@ class TestSettingsConfiguration(unittest.TestCase):
        settings = self.settings
        settings['PAGINATED_DIRECT_TEMPLATES'] = ['index', 'archives']
        settings['PAGINATED_TEMPLATES'] = {'index': 10, 'category': None}
-        settings = configure_settings(settings)
+        settings = handle_deprecated_settings(settings)
        self.assertEqual(settings['PAGINATED_TEMPLATES'],
                         {'index': 10, 'category': None, 'archives': None})
        self.assertNotIn('PAGINATED_DIRECT_TEMPLATES', settings)
@ -191,4 +192,82 @@ class TestSettingsConfiguration(unittest.TestCase):
        settings['EXTRA_TEMPLATES_PATHS'] = ['/ha']
        settings['THEME_TEMPLATES_OVERRIDES'] = ['/foo/bar']

-        self.assertRaises(Exception, configure_settings, settings)
+        self.assertRaises(Exception, handle_deprecated_settings, settings)
+
+    def test_slug_and_slug_regex_substitutions_exception(self):
+        settings = {}
+        settings['SLUG_REGEX_SUBSTITUTIONS'] = [('C++', 'cpp')]
+        settings['TAG_SUBSTITUTIONS'] = [('C#', 'csharp')]
+
+        self.assertRaises(Exception, handle_deprecated_settings, settings)
+
+    def test_deprecated_slug_substitutions(self):
+        default_slug_regex_subs = self.settings['SLUG_REGEX_SUBSTITUTIONS']
+
+        # If no deprecated setting is set, don't set new ones
+        settings = {}
+        settings = handle_deprecated_settings(settings)
+        self.assertNotIn('SLUG_REGEX_SUBSTITUTIONS', settings)
+        self.assertNotIn('TAG_REGEX_SUBSTITUTIONS', settings)
+        self.assertNotIn('CATEGORY_REGEX_SUBSTITUTIONS', settings)
+        self.assertNotIn('AUTHOR_REGEX_SUBSTITUTIONS', settings)
+
+        # If SLUG_SUBSTITUTIONS is set, set {SLUG, AUTHOR}_REGEX_SUBSTITUTIONS
+        # correctly, don't set {CATEGORY, TAG}_REGEX_SUBSTITUTIONS
+        settings = {}
+        settings['SLUG_SUBSTITUTIONS'] = [('C++', 'cpp')]
+        settings = handle_deprecated_settings(settings)
+        self.assertEqual(settings.get('SLUG_REGEX_SUBSTITUTIONS'),
+                         [(r'C\+\+', 'cpp')] + default_slug_regex_subs)
+        self.assertNotIn('TAG_REGEX_SUBSTITUTIONS', settings)
+        self.assertNotIn('CATEGORY_REGEX_SUBSTITUTIONS', settings)
+        self.assertEqual(settings.get('AUTHOR_REGEX_SUBSTITUTIONS'),
+                         default_slug_regex_subs)
+
+        # If {CATEGORY, TAG, AUTHOR}_SUBSTITUTIONS are set, set
+        # {CATEGORY, TAG, AUTHOR}_REGEX_SUBSTITUTIONS correctly, don't set
+        # SLUG_REGEX_SUBSTITUTIONS
+        settings = {}
+        settings['TAG_SUBSTITUTIONS'] = [('C#', 'csharp')]
+        settings['CATEGORY_SUBSTITUTIONS'] = [('C#', 'csharp')]
+        settings['AUTHOR_SUBSTITUTIONS'] = [('Alexander Todorov', 'atodorov')]
+        settings = handle_deprecated_settings(settings)
+        self.assertNotIn('SLUG_REGEX_SUBSTITUTIONS', settings)
+        self.assertEqual(settings['TAG_REGEX_SUBSTITUTIONS'],
+                         [(r'C\#', 'csharp')] + default_slug_regex_subs)
+        self.assertEqual(settings['CATEGORY_REGEX_SUBSTITUTIONS'],
+                         [(r'C\#', 'csharp')] + default_slug_regex_subs)
+        self.assertEqual(settings['AUTHOR_REGEX_SUBSTITUTIONS'],
+                         [(r'Alexander\ Todorov', 'atodorov')] +
+                         default_slug_regex_subs)
+
+        # If {SLUG, CATEGORY, TAG, AUTHOR}_SUBSTITUTIONS are set, set
+        # {SLUG, CATEGORY, TAG, AUTHOR}_REGEX_SUBSTITUTIONS correctly
+        settings = {}
+        settings['SLUG_SUBSTITUTIONS'] = [('C++', 'cpp')]
+        settings['TAG_SUBSTITUTIONS'] = [('C#', 'csharp')]
+        settings['CATEGORY_SUBSTITUTIONS'] = [('C#', 'csharp')]
+        settings['AUTHOR_SUBSTITUTIONS'] = [('Alexander Todorov', 'atodorov')]
+        settings = handle_deprecated_settings(settings)
+        self.assertEqual(settings['TAG_REGEX_SUBSTITUTIONS'],
+                         [(r'C\+\+', 'cpp')] + [(r'C\#', 'csharp')] +
+                         default_slug_regex_subs)
+        self.assertEqual(settings['CATEGORY_REGEX_SUBSTITUTIONS'],
+                         [(r'C\+\+', 'cpp')] + [(r'C\#', 'csharp')] +
+                         default_slug_regex_subs)
+        self.assertEqual(settings['AUTHOR_REGEX_SUBSTITUTIONS'],
+                         [(r'Alexander\ Todorov', 'atodorov')] +
+                         default_slug_regex_subs)
+
+        # Handle old 'skip' flags correctly
+        settings = {}
+        settings['SLUG_SUBSTITUTIONS'] = [('C++', 'cpp', True)]
+        settings['AUTHOR_SUBSTITUTIONS'] = [('Alexander Todorov', 'atodorov',
+                                             False)]
+        settings = handle_deprecated_settings(settings)
+        self.assertEqual(settings.get('SLUG_REGEX_SUBSTITUTIONS'),
+                         [(r'C\+\+', 'cpp')] +
+                         [(r'(?u)\A\s*', ''), (r'(?u)\s*\Z', '')])
+        self.assertEqual(settings['AUTHOR_REGEX_SUBSTITUTIONS'],
+                         [(r'Alexander\ Todorov', 'atodorov')] +
+                         default_slug_regex_subs)
--- a/pelican/tests/test_urlwrappers.py
+++ b/pelican/tests/test_urlwrappers.py
@ -55,30 +55,29 @@ class TestURLWrapper(unittest.TestCase):
        self.assertEqual(author, author_equal)

        cat_ascii = Category('指導書', settings={})
-        self.assertEqual(cat_ascii, u'zhi-dao-shu')
+        self.assertEqual(cat_ascii, u'zhi dao shu')

    def test_slugify_with_substitutions_and_dots(self):
-        tag = Tag('Tag Dot',
-                  settings={
-                        'TAG_SUBSTITUTIONS': [('Tag Dot', 'tag.dot', True)]
-                    })
+        tag = Tag('Tag Dot', settings={'TAG_REGEX_SUBSTITUTIONS': [
+            ('Tag Dot', 'tag.dot'),
+        ]})
        cat = Category('Category Dot',
-                       settings={
-                        'CATEGORY_SUBSTITUTIONS': (('Category Dot',
-                                                    'cat.dot',
-                                                    True),)
-                        })
+                       settings={'CATEGORY_REGEX_SUBSTITUTIONS': [
+                           ('Category Dot', 'cat.dot'),
+                       ]})

        self.assertEqual(tag.slug, 'tag.dot')
        self.assertEqual(cat.slug, 'cat.dot')

    def test_author_slug_substitutions(self):
-        settings = {
-            'AUTHOR_SUBSTITUTIONS': [
-                                    ('Alexander Todorov', 'atodorov', False),
-                                    ('Krasimir Tsonev', 'krasimir', False),
-            ]
-        }
+        settings = {'AUTHOR_REGEX_SUBSTITUTIONS': [
+            ('Alexander Todorov', 'atodorov'),
+            ('Krasimir Tsonev', 'krasimir'),
+            (r'[^\w\s-]', ''),
+            (r'(?u)\A\s*', ''),
+            (r'(?u)\s*\Z', ''),
+            (r'[-\s]+', '-'),
+        ]}

        author1 = Author('Mr. Senko', settings=settings)
        author2 = Author('Alexander Todorov', settings=settings)
--- a/pelican/tests/test_utils.py
+++ b/pelican/tests/test_utils.py
@ -119,8 +119,11 @@ class TestUtils(LoggedTestCase):
                   ('大飯原発４号機、１８日夜起動へ',
                    'da-fan-yuan-fa-4hao-ji-18ri-ye-qi-dong-he'),)

+        settings = read_settings()
+        subs = settings['SLUG_REGEX_SUBSTITUTIONS']
+
        for value, expected in samples:
-            self.assertEqual(utils.slugify(value), expected)
+            self.assertEqual(utils.slugify(value, regex_subs=subs), expected)

    def test_slugify_substitute(self):

@ -129,21 +132,27 @@ class TestUtils(LoggedTestCase):
                   ('c++, c#, C#, C++', 'cpp-c-sharp-c-sharp-cpp'),
                   ('c++-streams', 'cpp-streams'),)

-        subs = (('C++', 'CPP'), ('C#', 'C-SHARP'))
+        settings = read_settings()
+        subs = [
+            (r'C\+\+', 'CPP'),
+            (r'C#', 'C-SHARP'),
+        ] + settings['SLUG_REGEX_SUBSTITUTIONS']
        for value, expected in samples:
-            self.assertEqual(utils.slugify(value, subs), expected)
+            self.assertEqual(utils.slugify(value, regex_subs=subs), expected)

    def test_slugify_substitute_and_keeping_non_alphanum(self):

        samples = (('Fedora QA', 'fedora.qa'),
                   ('C++ is used by Fedora QA', 'cpp is used by fedora.qa'),
-                   ('C++ is based on C', 'cpp-is-based-on-c'),
-                   ('C+++ test C+ test', 'cpp-test-c-test'),)
+                   ('C++ is based on C', 'cpp is based on c'),
+                   ('C+++ test C+ test', 'cpp+ test c+ test'),)

-        subs = (('Fedora QA', 'fedora.qa', True),
-                ('c++', 'cpp'),)
+        subs = [
+            (r'Fedora QA', 'fedora.qa'),
+            (r'c\+\+', 'cpp'),
+        ]
        for value, expected in samples:
-            self.assertEqual(utils.slugify(value, subs), expected)
+            self.assertEqual(utils.slugify(value, regex_subs=subs), expected)

    def test_get_relative_path(self):

--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@ -17,6 +17,7 @@ from six.moves.urllib.request import urlretrieve

 # because logging.setLoggerClass has to be called before logging.getLogger
 from pelican.log import init
+from pelican.settings import read_settings
 from pelican.utils import SafeDatetime, slugify

 try:
@ -291,6 +292,8 @@ def dc2fields(file):

    print("%i posts read." % len(posts))

+    settings = read_settings()
+    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    for post in posts:
        fields = post.split('","')

@ -383,8 +386,9 @@ def dc2fields(file):
        kind = 'article'  # TODO: Recognise pages
        status = 'published'  # TODO: Find a way for draft posts

-        yield (post_title, content, slugify(post_title), post_creadt, author,
-               categories, tags, status, kind, post_format)
+        yield (post_title, content, slugify(post_title, regex_subs=subs),
+               post_creadt, author, categories, tags, status, kind,
+               post_format)


 def posterous2fields(api_token, email, password):
@ -418,6 +422,8 @@ def posterous2fields(api_token, email, password):

    page = 1
    posts = get_posterous_posts(api_token, email, password, page)
+    settings = read_settings()
+    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    while len(posts) > 0:
        posts = get_posterous_posts(api_token, email, password, page)
        page += 1
@ -425,7 +431,7 @@ def posterous2fields(api_token, email, password):
        for post in posts:
            slug = post.get('slug')
            if not slug:
-                slug = slugify(post.get('title'))
+                slug = slugify(post.get('title'), regex_subs=subs)
            tags = [tag.get('name') for tag in post.get('tags')]
            raw_date = post.get('display_date')
            date_object = SafeDatetime.strptime(
@ -469,13 +475,15 @@ def tumblr2fields(api_key, blogname):

    offset = 0
    posts = get_tumblr_posts(api_key, blogname, offset)
+    settings = read_settings()
+    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    while len(posts) > 0:
        for post in posts:
            title = \
                post.get('title') or \
                post.get('source_title') or \
                post.get('type').capitalize()
-            slug = post.get('slug') or slugify(title)
+            slug = post.get('slug') or slugify(title, regex_subs=subs)
            tags = post.get('tags')
            timestamp = post.get('timestamp')
            date = SafeDatetime.fromtimestamp(int(timestamp)).strftime(
@ -552,6 +560,8 @@ def feed2fields(file):
    """Read a feed and yield pelican fields"""
    import feedparser
    d = feedparser.parse(file)
+    settings = read_settings()
+    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    for entry in d.entries:
        date = (entry.updated_parsed.strftime('%Y-%m-%d %H:%M')
                if hasattr(entry, 'updated_parsed') else None)
@ -559,7 +569,7 @@ def feed2fields(file):
        tags = ([e['term'] for e in entry.tags]
                if hasattr(entry, 'tags') else None)

-        slug = slugify(entry.title)
+        slug = slugify(entry.title, regex_subs=subs)
        kind = 'article'
        yield (entry.title, entry.description, slug, date,
               author, [], tags, None, kind, 'html')
@ -621,7 +631,7 @@ def get_ext(out_markup, in_markup='html'):


 def get_out_filename(output_path, filename, ext, kind,
-                     dirpage, dircat, categories, wp_custpost):
+                     dirpage, dircat, categories, wp_custpost, slug_subs):
    filename = os.path.basename(filename)

    # Enforce filename restrictions for various filesystems at once; see
@ -647,12 +657,12 @@ def get_out_filename(output_path, filename, ext, kind,
    # create subdirectories with category names
    elif kind != 'article':
        if wp_custpost:
-            typename = slugify(kind)
+            typename = slugify(kind, regex_subs=slug_subs)
        else:
            typename = ''
            kind = 'article'
        if dircat and (len(categories) > 0):
-            catname = slugify(categories[0])
+            catname = slugify(categories[0], regex_subs=slug_subs)
        else:
            catname = ''
        out_filename = os.path.join(output_path, typename,
@ -661,7 +671,7 @@ def get_out_filename(output_path, filename, ext, kind,
            os.makedirs(os.path.join(output_path, typename, catname))
    # option to put files in directories with categories names
    elif dircat and (len(categories) > 0):
-        catname = slugify(categories[0])
+        catname = slugify(categories[0], regex_subs=slug_subs)
        out_filename = os.path.join(output_path, catname, filename + ext)
        if not os.path.isdir(os.path.join(output_path, catname)):
            os.mkdir(os.path.join(output_path, catname))
@ -768,6 +778,9 @@ def fields2pelican(
                 'requested import action.')
        exit(error)

+    settings = read_settings()
+    slug_subs = settings['SLUG_REGEX_SUBSTITUTIONS']
+
    for (title, content, filename, date, author, categories, tags, status,
            kind, in_markup) in fields:
        if filter_author and filter_author != author:
@ -796,7 +809,7 @@ def fields2pelican(

        out_filename = get_out_filename(
            output_path, filename, ext, kind, dirpage, dircat,
-            categories, wp_custpost)
+            categories, wp_custpost, slug_subs)
        print(out_filename)

        if in_markup in ('html', 'wp-html'):
--- a/pelican/urlwrappers.py
+++ b/pelican/urlwrappers.py
@ -36,8 +36,9 @@ class URLWrapper(object):
    @property
    def slug(self):
        if self._slug is None:
-            self._slug = slugify(self.name,
-                                 self.settings.get('SLUG_SUBSTITUTIONS', ()))
+            self._slug = slugify(
+                self.name,
+                regex_subs=self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
        return self._slug

    @slug.setter
@ -56,8 +57,8 @@ class URLWrapper(object):
        return hash(self.slug)

    def _normalize_key(self, key):
-        subs = self.settings.get('SLUG_SUBSTITUTIONS', ())
-        return six.text_type(slugify(key, subs))
+        subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])
+        return six.text_type(slugify(key, regex_subs=subs))

    def __eq__(self, other):
        if isinstance(other, self.__class__):
@ -115,10 +116,11 @@ class Category(URLWrapper):
    @property
    def slug(self):
        if self._slug is None:
-            substitutions = self.settings.get('SLUG_SUBSTITUTIONS', ())
-            substitutions += tuple(self.settings.get('CATEGORY_SUBSTITUTIONS',
-                                                     ()))
-            self._slug = slugify(self.name, substitutions)
+            if 'CATEGORY_REGEX_SUBSTITUTIONS' in self.settings:
+                subs = self.settings['CATEGORY_REGEX_SUBSTITUTIONS']
+            else:
+                subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])
+            self._slug = slugify(self.name, regex_subs=subs)
        return self._slug


@ -129,9 +131,11 @@ class Tag(URLWrapper):
    @property
    def slug(self):
        if self._slug is None:
-            substitutions = self.settings.get('SLUG_SUBSTITUTIONS', ())
-            substitutions += tuple(self.settings.get('TAG_SUBSTITUTIONS', ()))
-            self._slug = slugify(self.name, substitutions)
+            if 'TAG_REGEX_SUBSTITUTIONS' in self.settings:
+                subs = self.settings['TAG_REGEX_SUBSTITUTIONS']
+            else:
+                subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])
+            self._slug = slugify(self.name, regex_subs=subs)
        return self._slug


@ -139,6 +143,9 @@ class Author(URLWrapper):
    @property
    def slug(self):
        if self._slug is None:
-            self._slug = slugify(self.name,
-                                 self.settings.get('AUTHOR_SUBSTITUTIONS', ()))
+            if 'AUTHOR_REGEX_SUBSTITUTIONS' in self.settings:
+                subs = self.settings['AUTHOR_REGEX_SUBSTITUTIONS']
+            else:
+                subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])
+            self._slug = slugify(self.name, regex_subs=subs)
        return self._slug
--- a/pelican/utils.py
+++ b/pelican/utils.py
@ -263,13 +263,14 @@ def pelican_open(filename, mode='rb', strip_crs=(sys.platform == 'win32')):
    yield content


-def slugify(value, substitutions=()):
+def slugify(value, regex_subs=()):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.

    Took from Django sources.
    """
+
    # TODO Maybe steal again from current Django 1.5dev
    value = Markup(value).striptags()
    # value must be unicode per se
@ -281,37 +282,16 @@ def slugify(value, substitutions=()):
    if isinstance(value, six.binary_type):
        value = value.decode('ascii')
    # still unicode
-    value = unicodedata.normalize('NFKD', value).lower()
+    value = unicodedata.normalize('NFKD', value)

-    # backward compatible covert from 2-tuples to 3-tuples
-    new_subs = []
-    for tpl in substitutions:
-        try:
-            src, dst, skip = tpl
-        except ValueError:
-            src, dst = tpl
-            skip = False
-        new_subs.append((src, dst, skip))
-    substitutions = tuple(new_subs)
+    for src, dst in regex_subs:
+        value = re.sub(src, dst, value, flags=re.IGNORECASE)

-    # by default will replace non-alphanum characters
-    replace = True
-    for src, dst, skip in substitutions:
-        orig_value = value
-        value = value.replace(src.lower(), dst.lower())
-        # if replacement was made then skip non-alphanum
-        # replacement if instructed to do so
-        if value != orig_value:
-            replace = replace and not skip
-
-    if replace:
-        value = re.sub(r'[^\w\s-]', '', value).strip()
-        value = re.sub(r'[-\s]+', '-', value)
-    else:
-        value = value.strip()
+    # convert to lowercase
+    value = value.lower()

    # we want only ASCII chars
-    value = value.encode('ascii', 'ignore')
+    value = value.encode('ascii', 'ignore').strip()
    # but Pelican should generally use only unicode
    return value.decode('ascii')