diff --git a/docs/changelog.rst b/docs/changelog.rst index 883c86d0..13ef570e 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -6,6 +6,8 @@ Next release * New signal: ``feed_generated`` * Replace Fabric by Invoke and ``fabfile.py`` template by ``tasks.py``. +* Replace ``SLUG_SUBSTITUTIONS`` (and friends) by ``SLUG_REGEX_SUBSTITUTIONS`` + for more fine-grained control 3.7.1 (2017-01-10) ================== diff --git a/docs/settings.rst b/docs/settings.rst index 26f0a233..a026f914 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -519,27 +519,6 @@ respectively. The URL to use for per-day archives of your posts. Used only if you have the ``{url}`` placeholder in ``PAGINATION_PATTERNS``. -.. data:: SLUG_SUBSTITUTIONS = () - - Substitutions to make prior to stripping out non-alphanumerics when - generating slugs. Specified as a list of 3-tuples of ``(from, to, skip)`` - which are applied in order. ``skip`` is a boolean indicating whether or not - to skip replacement of non-alphanumeric characters. Useful for backward - compatibility with existing URLs. - -.. data:: AUTHOR_SUBSTITUTIONS = () - - Substitutions for authors. ``SLUG_SUBSTITUTIONS`` is not taken into account - here! - -.. data:: CATEGORY_SUBSTITUTIONS = () - - Added to ``SLUG_SUBSTITUTIONS`` for categories. - -.. data:: TAG_SUBSTITUTIONS = () - - Added to ``SLUG_SUBSTITUTIONS`` for tags. - .. note:: If you do not want one or more of the default pages to be created (e.g., @@ -547,24 +526,6 @@ respectively. set the corresponding ``*_SAVE_AS`` setting to ``''`` to prevent the relevant page from being generated. -.. note:: - - Substitutions are applied in order with the side effect that keeping - non-alphanum characters applies to the whole string when a replacement - is made. 
- - For example if you have the following setting:: - - SLUG_SUBSTITUTIONS = (('C++', 'cpp'), ('keep dot', 'keep.dot', True)) - - the string ``Keep Dot`` will be converted to ``keep.dot``, however - ``C++ will keep dot`` will be converted to ``cpp will keep.dot`` instead - of ``cpp-will-keep.dot``! - - If you want to keep non-alphanum characters only for tags or categories - but not other slugs then configure ``TAG_SUBSTITUTIONS`` and - ``CATEGORY_SUBSTITUTIONS`` respectively! - Pelican can optionally create per-year, per-month, and per-day archives of your posts. These secondary archives are disabled by default but are automatically enabled if you supply format strings for their respective ``_SAVE_AS`` settings. @@ -626,6 +587,33 @@ URLs for direct template pages are theme-dependent. Some themes use corresponding ``*_URL`` setting as string, while others hard-code them: ``'archives.html'``, ``'authors.html'``, ``'categories.html'``, ``'tags.html'``. +.. data:: SLUG_REGEX_SUBSTITUTIONS = [ + (r'[^\w\s-]', ''), # remove chars other than word chars, whitespace and '-' + (r'(?u)\A\s*', ''), # strip leading whitespace + (r'(?u)\s*\Z', ''), # strip trailing whitespace + (r'[-\s]+', '-'), # reduce multiple whitespace or '-' to single '-' + ] + + Regex substitutions to make when generating slugs of articles and pages. + Specified as a list of pairs of ``(from, to)`` which are applied in order, + ignoring case. The default substitutions have the effect of removing + non-alphanumeric characters and converting internal whitespace to dashes. + Apart from these substitutions, slugs are always converted to lowercase + ASCII characters and leading and trailing whitespace is stripped. Useful for + backward compatibility with existing URLs. + +.. data:: AUTHOR_REGEX_SUBSTITUTIONS = SLUG_REGEX_SUBSTITUTIONS + + Regex substitutions for author slugs. Defaults to ``SLUG_REGEX_SUBSTITUTIONS``. + +.. 
data:: CATEGORY_REGEX_SUBSTITUTIONS = SLUG_REGEX_SUBSTITUTIONS + + Regex substitutions for category slugs. Defaults to ``SLUG_REGEX_SUBSTITUTIONS``. + +.. data:: TAG_REGEX_SUBSTITUTIONS = SLUG_REGEX_SUBSTITUTIONS + + Regex substitutions for tag slugs. Defaults to ``SLUG_REGEX_SUBSTITUTIONS``. + Time and Date ============= diff --git a/pelican/contents.py b/pelican/contents.py index 14dfc89b..dbda19b0 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -98,14 +98,16 @@ class Content(object): if not hasattr(self, 'slug'): if (settings['SLUGIFY_SOURCE'] == 'title' and hasattr(self, 'title')): - self.slug = slugify(self.title, - settings.get('SLUG_SUBSTITUTIONS', ())) + self.slug = slugify( + self.title, + regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) elif (settings['SLUGIFY_SOURCE'] == 'basename' and source_path is not None): basename = os.path.basename( os.path.splitext(source_path)[0]) self.slug = slugify( - basename, settings.get('SLUG_SUBSTITUTIONS', ())) + basename, + regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) self.source_path = source_path diff --git a/pelican/settings.py b/pelican/settings.py index 9845dd18..5128ff6d 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -6,6 +6,7 @@ import inspect import locale import logging import os +import re from os.path import isabs from posixpath import join as posix_join @@ -145,7 +146,12 @@ DEFAULT_CONFIG = { 'TEMPLATE_PAGES': {}, 'TEMPLATE_EXTENSIONS': ['.html'], 'IGNORE_FILES': ['.#*'], - 'SLUG_SUBSTITUTIONS': (), + 'SLUG_REGEX_SUBSTITUTIONS': [ + (r'[^\w\s-]', ''), # remove non-alphabetical/whitespace/'-' chars + (r'(?u)\A\s*', ''), # strip leading whitespace + (r'(?u)\s*\Z', ''), # strip trailing whitespace + (r'[-\s]+', '-'), # reduce multiple whitespace or '-' to single '-' + ], 'INTRASITE_LINK_REGEX': '[{|](?P.*?)[|}]', 'SLUGIFY_SOURCE': 'title', 'CACHE_CONTENT': False, @@ -164,79 +170,62 @@ PYGMENTS_RST_OPTIONS = None def read_settings(path=None, override=None): + 
settings = override or {} + if path: - local_settings = get_settings_from_file(path) - # Make the paths relative to the settings file + settings = dict(get_settings_from_file(path), **settings) + + if settings: + settings = handle_deprecated_settings(settings) + + if path: + # Make relative paths absolute + def getabs(maybe_relative, base_path=path): + if isabs(maybe_relative): + return maybe_relative + return os.path.abspath(os.path.normpath(os.path.join( + os.path.dirname(base_path), maybe_relative))) + for p in ['PATH', 'OUTPUT_PATH', 'THEME', 'CACHE_PATH']: - if p in local_settings and local_settings[p] is not None \ - and not isabs(local_settings[p]): - absp = os.path.abspath(os.path.normpath(os.path.join( - os.path.dirname(path), local_settings[p]))) + if settings.get(p) is not None: + absp = getabs(settings[p]) + # THEME may be a name rather than a path if p != 'THEME' or os.path.exists(absp): - local_settings[p] = absp + settings[p] = absp - if 'PLUGIN_PATH' in local_settings: - logger.warning('PLUGIN_PATH setting has been replaced by ' - 'PLUGIN_PATHS, moving it to the new setting name.') - local_settings['PLUGIN_PATHS'] = local_settings['PLUGIN_PATH'] - del local_settings['PLUGIN_PATH'] - if 'JINJA_EXTENSIONS' in local_settings: - logger.warning('JINJA_EXTENSIONS setting has been deprecated, ' - 'moving it to JINJA_ENVIRONMENT setting.') - local_settings['JINJA_ENVIRONMENT']['extensions'] = \ - local_settings['JINJA_EXTENSIONS'] - del local_settings['JINJA_EXTENSIONS'] - if isinstance(local_settings['PLUGIN_PATHS'], six.string_types): - logger.warning("Defining PLUGIN_PATHS setting as string " - "has been deprecated (should be a list)") - local_settings['PLUGIN_PATHS'] = [local_settings['PLUGIN_PATHS']] - elif local_settings['PLUGIN_PATHS'] is not None: - def getabs(path, pluginpath): - if isabs(pluginpath): - return pluginpath - else: - path_dirname = os.path.dirname(path) - path_joined = os.path.join(path_dirname, pluginpath) - path_normed = 
os.path.normpath(path_joined) - path_absolute = os.path.abspath(path_normed) - return path_absolute + if settings.get('PLUGIN_PATHS') is not None: + settings['PLUGIN_PATHS'] = [getabs(pluginpath) + for pluginpath + in settings['PLUGIN_PATHS']] - pluginpath_list = [getabs(path, pluginpath) - for pluginpath - in local_settings['PLUGIN_PATHS']] - local_settings['PLUGIN_PATHS'] = pluginpath_list - else: - local_settings = copy.deepcopy(DEFAULT_CONFIG) + settings = dict(copy.deepcopy(DEFAULT_CONFIG), **settings) + settings = configure_settings(settings) - if override: - local_settings.update(override) - - parsed_settings = configure_settings(local_settings) # This is because there doesn't seem to be a way to pass extra # parameters to docutils directive handlers, so we have to have a # variable here that we'll import from within Pygments.run (see # rstdirectives.py) to see what the user defaults were. global PYGMENTS_RST_OPTIONS - PYGMENTS_RST_OPTIONS = parsed_settings.get('PYGMENTS_RST_OPTIONS', None) - return parsed_settings + PYGMENTS_RST_OPTIONS = settings.get('PYGMENTS_RST_OPTIONS', None) + return settings -def get_settings_from_module(module=None, default_settings=DEFAULT_CONFIG): +def get_settings_from_module(module=None): """Loads settings from a module, returns a dictionary.""" - context = copy.deepcopy(default_settings) + context = {} if module is not None: context.update( (k, v) for k, v in inspect.getmembers(module) if k.isupper()) return context -def get_settings_from_file(path, default_settings=DEFAULT_CONFIG): +def get_settings_from_file(path): """Loads settings from a file path, returning a dict.""" name, ext = os.path.splitext(os.path.basename(path)) module = load_source(name, path) - return get_settings_from_module(module, default_settings=default_settings) + return get_settings_from_module(module) def get_jinja_environment(settings): @@ -253,6 +242,149 @@ def get_jinja_environment(settings): return settings +def handle_deprecated_settings(settings): + 
"""Converts deprecated settings and issues warnings. Issues an exception + if both old and new setting is specified. + """ + + # PLUGIN_PATH -> PLUGIN_PATHS + if 'PLUGIN_PATH' in settings: + logger.warning('PLUGIN_PATH setting has been replaced by ' + 'PLUGIN_PATHS, moving it to the new setting name.') + settings['PLUGIN_PATHS'] = settings['PLUGIN_PATH'] + del settings['PLUGIN_PATH'] + + # PLUGIN_PATHS: str -> [str] + if isinstance(settings.get('PLUGIN_PATHS'), six.string_types): + logger.warning("Defining PLUGIN_PATHS setting as string " + "has been deprecated (should be a list)") + settings['PLUGIN_PATHS'] = [settings['PLUGIN_PATHS']] + + # JINJA_EXTENSIONS -> JINJA_ENVIRONMENT > extensions + if 'JINJA_EXTENSIONS' in settings: + logger.warning('JINJA_EXTENSIONS setting has been deprecated, ' + 'moving it to JINJA_ENVIRONMENT setting.') + settings['JINJA_ENVIRONMENT']['extensions'] = \ + settings['JINJA_EXTENSIONS'] + del settings['JINJA_EXTENSIONS'] + + # {ARTICLE,PAGE}_DIR -> {ARTICLE,PAGE}_PATHS + for key in ['ARTICLE', 'PAGE']: + old_key = key + '_DIR' + new_key = key + '_PATHS' + if old_key in settings: + logger.warning( + 'Deprecated setting %s, moving it to %s list', + old_key, new_key) + settings[new_key] = [settings[old_key]] # also make a list + del settings[old_key] + + # EXTRA_TEMPLATES_PATHS -> THEME_TEMPLATES_OVERRIDES + if 'EXTRA_TEMPLATES_PATHS' in settings: + logger.warning('EXTRA_TEMPLATES_PATHS is deprecated use ' + 'THEME_TEMPLATES_OVERRIDES instead.') + if ('THEME_TEMPLATES_OVERRIDES' in settings and + settings['THEME_TEMPLATES_OVERRIDES']): + raise Exception( + 'Setting both EXTRA_TEMPLATES_PATHS and ' + 'THEME_TEMPLATES_OVERRIDES is not permitted. 
Please move to ' + 'only setting THEME_TEMPLATES_OVERRIDES.') + settings['THEME_TEMPLATES_OVERRIDES'] = \ + settings['EXTRA_TEMPLATES_PATHS'] + del settings['EXTRA_TEMPLATES_PATHS'] + + # MD_EXTENSIONS -> MARKDOWN + if 'MD_EXTENSIONS' in settings: + logger.warning('MD_EXTENSIONS is deprecated use MARKDOWN ' + 'instead. Falling back to the default.') + settings['MARKDOWN'] = DEFAULT_CONFIG['MARKDOWN'] + + # LESS_GENERATOR -> Webassets plugin + # FILES_TO_COPY -> STATIC_PATHS, EXTRA_PATH_METADATA + for old, new, doc in [ + ('LESS_GENERATOR', 'the Webassets plugin', None), + ('FILES_TO_COPY', 'STATIC_PATHS and EXTRA_PATH_METADATA', + 'https://github.com/getpelican/pelican/' + 'blob/master/docs/settings.rst#path-metadata'), + ]: + if old in settings: + message = 'The {} setting has been removed in favor of {}'.format( + old, new) + if doc: + message += ', see {} for details'.format(doc) + logger.warning(message) + + # PAGINATED_DIRECT_TEMPLATES -> PAGINATED_TEMPLATES + if 'PAGINATED_DIRECT_TEMPLATES' in settings: + message = 'The {} setting has been removed in favor of {}'.format( + 'PAGINATED_DIRECT_TEMPLATES', 'PAGINATED_TEMPLATES') + logger.warning(message) + + for t in settings['PAGINATED_DIRECT_TEMPLATES']: + if t not in settings['PAGINATED_TEMPLATES']: + settings['PAGINATED_TEMPLATES'][t] = None + del settings['PAGINATED_DIRECT_TEMPLATES'] + + # {SLUG,CATEGORY,TAG,AUTHOR}_SUBSTITUTIONS -> + # {SLUG,CATEGORY,TAG,AUTHOR}_REGEX_SUBSTITUTIONS + url_settings_url = \ + 'http://docs.getpelican.com/en/latest/settings.html#url-settings' + flavours = {'SLUG', 'CATEGORY', 'TAG', 'AUTHOR'} + old_values = {f: settings[f + '_SUBSTITUTIONS'] + for f in flavours if f + '_SUBSTITUTIONS' in settings} + new_values = {f: settings[f + '_REGEX_SUBSTITUTIONS'] + for f in flavours if f + '_REGEX_SUBSTITUTIONS' in settings} + if old_values and new_values: + raise Exception( + 'Setting both {new_key} and {old_key} (or variants thereof) is ' + 'not permitted. 
Please move to only setting {new_key}.' + .format(old_key='SLUG_SUBSTITUTIONS', + new_key='SLUG_REGEX_SUBSTITUTIONS')) + if old_values: + message = ('{} and variants thereof are deprecated and will be ' + 'removed in the future. Please use {} and variants thereof ' + 'instead. Check {}.' + .format('SLUG_SUBSTITUTIONS', 'SLUG_REGEX_SUBSTITUTIONS', + url_settings_url)) + logger.warning(message) + if old_values.get('SLUG'): + for f in {'CATEGORY', 'TAG'}: + if old_values.get(f): + old_values[f] = old_values['SLUG'] + old_values[f] + old_values['AUTHOR'] = old_values.get('AUTHOR', []) + for f in flavours: + if old_values.get(f) is not None: + regex_subs = [] + # by default will replace non-alphanum characters + replace = True + for tpl in old_values[f]: + try: + src, dst, skip = tpl + if skip: + replace = False + except ValueError: + src, dst = tpl + regex_subs.append( + (re.escape(src), dst.replace('\\', r'\\'))) + + if replace: + regex_subs += [ + (r'[^\w\s-]', ''), + (r'(?u)\A\s*', ''), + (r'(?u)\s*\Z', ''), + (r'[-\s]+', '-'), + ] + else: + regex_subs += [ + (r'(?u)\A\s*', ''), + (r'(?u)\s*\Z', ''), + ] + settings[f + '_REGEX_SUBSTITUTIONS'] = regex_subs + settings.pop(f + '_SUBSTITUTIONS', None) + + return settings + + def configure_settings(settings): """Provide optimizations, error checking, and warnings for the given settings. 
@@ -377,31 +509,6 @@ def configure_settings(settings): key=lambda r: r[0], ) - # move {ARTICLE,PAGE}_DIR -> {ARTICLE,PAGE}_PATHS - for key in ['ARTICLE', 'PAGE']: - old_key = key + '_DIR' - new_key = key + '_PATHS' - if old_key in settings: - logger.warning( - 'Deprecated setting %s, moving it to %s list', - old_key, new_key) - settings[new_key] = [settings[old_key]] # also make a list - del settings[old_key] - - # Deprecated warning of EXTRA_TEMPLATES_PATHS - if 'EXTRA_TEMPLATES_PATHS' in settings: - logger.warning('EXTRA_TEMPLATES_PATHS is deprecated use ' - 'THEME_TEMPLATES_OVERRIDES instead.') - if ('THEME_TEMPLATES_OVERRIDES' in settings and - settings['THEME_TEMPLATES_OVERRIDES']): - raise Exception( - 'Setting both EXTRA_TEMPLATES_PATHS and ' - 'THEME_TEMPLATES_OVERRIDES is not permitted. Please move to ' - 'only setting THEME_TEMPLATES_OVERRIDES.') - settings['THEME_TEMPLATES_OVERRIDES'] = \ - settings['EXTRA_TEMPLATES_PATHS'] - del settings['EXTRA_TEMPLATES_PATHS'] - # Save people from accidentally setting a string rather than a list path_keys = ( 'ARTICLE_EXCLUDES', @@ -425,12 +532,6 @@ def configure_settings(settings): PATH_KEY) settings[PATH_KEY] = DEFAULT_CONFIG[PATH_KEY] - # Deprecated warning of MD_EXTENSIONS - if 'MD_EXTENSIONS' in settings: - logger.warning('MD_EXTENSIONS is deprecated use MARKDOWN ' - 'instead. 
Falling back to the default.') - settings['MARKDOWN'] = DEFAULT_CONFIG['MARKDOWN'] - # Add {PAGE,ARTICLE}_PATHS to {ARTICLE,PAGE}_EXCLUDES mutually_exclusive = ('ARTICLE', 'PAGE') for type_1, type_2 in [mutually_exclusive, mutually_exclusive[::-1]]: @@ -443,27 +544,4 @@ def configure_settings(settings): except KeyError: continue # setting not specified, nothing to do - for old, new, doc in [ - ('LESS_GENERATOR', 'the Webassets plugin', None), - ('FILES_TO_COPY', 'STATIC_PATHS and EXTRA_PATH_METADATA', - 'https://github.com/getpelican/pelican/' - 'blob/master/docs/settings.rst#path-metadata'), - ]: - if old in settings: - message = 'The {} setting has been removed in favor of {}'.format( - old, new) - if doc: - message += ', see {} for details'.format(doc) - logger.warning(message) - - if 'PAGINATED_DIRECT_TEMPLATES' in settings: - message = 'The {} setting has been removed in favor of {}'.format( - 'PAGINATED_DIRECT_TEMPLATES', 'PAGINATED_TEMPLATES') - logger.warning(message) - - for t in settings['PAGINATED_DIRECT_TEMPLATES']: - if t not in settings['PAGINATED_TEMPLATES']: - settings['PAGINATED_TEMPLATES'][t] = None - del settings['PAGINATED_DIRECT_TEMPLATES'] - return settings diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 8ab60553..032df468 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -497,7 +497,13 @@ class TestArticle(TestPage): def test_slugify_category_author(self): settings = get_settings() - settings['SLUG_SUBSTITUTIONS'] = [('C#', 'csharp')] + settings['SLUG_REGEX_SUBSTITUTIONS'] = [ + (r'C#', 'csharp'), + (r'[^\w\s-]', ''), + (r'(?u)\A\s*', ''), + (r'(?u)\s*\Z', ''), + (r'[-\s]+', '-'), + ] settings['ARTICLE_URL'] = '{author}/{category}/{slug}/' settings['ARTICLE_SAVE_AS'] = '{author}/{category}/{slug}/index.html' article_kwargs = self._copy_page_kwargs() @@ -513,9 +519,13 @@ class TestArticle(TestPage): def test_slugify_with_author_substitutions(self): settings = get_settings() - 
settings['AUTHOR_SUBSTITUTIONS'] = [ - ('Alexander Todorov', 'atodorov', False), - ('Krasimir Tsonev', 'krasimir', False), + settings['AUTHOR_REGEX_SUBSTITUTIONS'] = [ + ('Alexander Todorov', 'atodorov'), + ('Krasimir Tsonev', 'krasimir'), + (r'[^\w\s-]', ''), + (r'(?u)\A\s*', ''), + (r'(?u)\s*\Z', ''), + (r'[-\s]+', '-'), ] settings['ARTICLE_URL'] = 'blog/{author}/{slug}/' settings['ARTICLE_SAVE_AS'] = 'blog/{author}/{slug}/index.html' @@ -530,7 +540,9 @@ class TestArticle(TestPage): def test_slugify_category_with_dots(self): settings = get_settings() - settings['CATEGORY_SUBSTITUTIONS'] = [('Fedora QA', 'fedora.qa', True)] + settings['CATEGORY_REGEX_SUBSTITUTIONS'] = [ + ('Fedora QA', 'fedora.qa'), + ] settings['ARTICLE_URL'] = '{category}/{slug}/' article_kwargs = self._copy_page_kwargs() article_kwargs['metadata']['category'] = Category('Fedora QA', @@ -542,7 +554,9 @@ class TestArticle(TestPage): def test_slugify_tags_with_dots(self): settings = get_settings() - settings['TAG_SUBSTITUTIONS'] = [('Fedora QA', 'fedora.qa', True)] + settings['TAG_REGEX_SUBSTITUTIONS'] = [ + ('Fedora QA', 'fedora.qa'), + ] settings['ARTICLE_URL'] = '{tag}/{slug}/' article_kwargs = self._copy_page_kwargs() article_kwargs['metadata']['tag'] = Tag('Fedora QA', settings) diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index f913562f..2cf05ba8 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -6,6 +6,7 @@ import os import re from codecs import open +from pelican.settings import DEFAULT_CONFIG from pelican.tests.support import (mute, skipIfNoExecutable, temporary_folder, unittest) from pelican.tools.pelican_import import (blogger2fields, build_header, @@ -133,10 +134,11 @@ class TestWordpressXmlImporter(unittest.TestCase): with temporary_folder() as temp: fnames = list(silent_f2p(test_posts, 'markdown', temp, dircat=True)) + subs = DEFAULT_CONFIG['SLUG_REGEX_SUBSTITUTIONS'] index = 0 for post in test_posts: name = post[2] - 
category = slugify(post[5][0]) + category = slugify(post[5][0], regex_subs=subs) name += '.md' filename = os.path.join(category, name) out_name = fnames[index] @@ -208,11 +210,12 @@ class TestWordpressXmlImporter(unittest.TestCase): with temporary_folder() as temp: fnames = list(silent_f2p(test_posts, 'markdown', temp, wp_custpost=True, dircat=True)) + subs = DEFAULT_CONFIG['SLUG_REGEX_SUBSTITUTIONS'] index = 0 for post in test_posts: name = post[2] kind = post[8] - category = slugify(post[5][0]) + category = slugify(post[5][0], regex_subs=subs) name += '.md' filename = os.path.join(kind, category, name) out_name = fnames[index] diff --git a/pelican/tests/test_settings.py b/pelican/tests/test_settings.py index 393c4337..5e794dc5 100644 --- a/pelican/tests/test_settings.py +++ b/pelican/tests/test_settings.py @@ -9,7 +9,8 @@ from sys import platform from pelican.settings import (DEFAULT_CONFIG, DEFAULT_THEME, - configure_settings, read_settings) + configure_settings, handle_deprecated_settings, + read_settings) from pelican.tests.support import unittest @@ -128,7 +129,7 @@ class TestSettingsConfiguration(unittest.TestCase): settings['ARTICLE_DIR'] = 'foo' settings['PAGE_DIR'] = 'bar' - configure_settings(settings) + settings = handle_deprecated_settings(settings) self.assertEqual(settings['ARTICLE_PATHS'], ['foo']) self.assertEqual(settings['PAGE_PATHS'], ['bar']) @@ -171,7 +172,7 @@ class TestSettingsConfiguration(unittest.TestCase): settings = self.settings settings['EXTRA_TEMPLATES_PATHS'] = ['/foo/bar', '/ha'] - configure_settings(settings) + settings = handle_deprecated_settings(settings) self.assertEqual(settings['THEME_TEMPLATES_OVERRIDES'], ['/foo/bar', '/ha']) @@ -181,7 +182,7 @@ class TestSettingsConfiguration(unittest.TestCase): settings = self.settings settings['PAGINATED_DIRECT_TEMPLATES'] = ['index', 'archives'] settings['PAGINATED_TEMPLATES'] = {'index': 10, 'category': None} - settings = configure_settings(settings) + settings = 
handle_deprecated_settings(settings) self.assertEqual(settings['PAGINATED_TEMPLATES'], {'index': 10, 'category': None, 'archives': None}) self.assertNotIn('PAGINATED_DIRECT_TEMPLATES', settings) @@ -191,4 +192,82 @@ class TestSettingsConfiguration(unittest.TestCase): settings['EXTRA_TEMPLATES_PATHS'] = ['/ha'] settings['THEME_TEMPLATES_OVERRIDES'] = ['/foo/bar'] - self.assertRaises(Exception, configure_settings, settings) + self.assertRaises(Exception, handle_deprecated_settings, settings) + + def test_slug_and_slug_regex_substitutions_exception(self): + settings = {} + settings['SLUG_REGEX_SUBSTITUTIONS'] = [('C++', 'cpp')] + settings['TAG_SUBSTITUTIONS'] = [('C#', 'csharp')] + + self.assertRaises(Exception, handle_deprecated_settings, settings) + + def test_deprecated_slug_substitutions(self): + default_slug_regex_subs = self.settings['SLUG_REGEX_SUBSTITUTIONS'] + + # If no deprecated setting is set, don't set new ones + settings = {} + settings = handle_deprecated_settings(settings) + self.assertNotIn('SLUG_REGEX_SUBSTITUTIONS', settings) + self.assertNotIn('TAG_REGEX_SUBSTITUTIONS', settings) + self.assertNotIn('CATEGORY_REGEX_SUBSTITUTIONS', settings) + self.assertNotIn('AUTHOR_REGEX_SUBSTITUTIONS', settings) + + # If SLUG_SUBSTITUTIONS is set, set {SLUG, AUTHOR}_REGEX_SUBSTITUTIONS + # correctly, don't set {CATEGORY, TAG}_REGEX_SUBSTITUTIONS + settings = {} + settings['SLUG_SUBSTITUTIONS'] = [('C++', 'cpp')] + settings = handle_deprecated_settings(settings) + self.assertEqual(settings.get('SLUG_REGEX_SUBSTITUTIONS'), + [(r'C\+\+', 'cpp')] + default_slug_regex_subs) + self.assertNotIn('TAG_REGEX_SUBSTITUTIONS', settings) + self.assertNotIn('CATEGORY_REGEX_SUBSTITUTIONS', settings) + self.assertEqual(settings.get('AUTHOR_REGEX_SUBSTITUTIONS'), + default_slug_regex_subs) + + # If {CATEGORY, TAG, AUTHOR}_SUBSTITUTIONS are set, set + # {CATEGORY, TAG, AUTHOR}_REGEX_SUBSTITUTIONS correctly, don't set + # SLUG_REGEX_SUBSTITUTIONS + settings = {} + 
settings['TAG_SUBSTITUTIONS'] = [('C#', 'csharp')] + settings['CATEGORY_SUBSTITUTIONS'] = [('C#', 'csharp')] + settings['AUTHOR_SUBSTITUTIONS'] = [('Alexander Todorov', 'atodorov')] + settings = handle_deprecated_settings(settings) + self.assertNotIn('SLUG_REGEX_SUBSTITUTIONS', settings) + self.assertEqual(settings['TAG_REGEX_SUBSTITUTIONS'], + [(r'C\#', 'csharp')] + default_slug_regex_subs) + self.assertEqual(settings['CATEGORY_REGEX_SUBSTITUTIONS'], + [(r'C\#', 'csharp')] + default_slug_regex_subs) + self.assertEqual(settings['AUTHOR_REGEX_SUBSTITUTIONS'], + [(r'Alexander\ Todorov', 'atodorov')] + + default_slug_regex_subs) + + # If {SLUG, CATEGORY, TAG, AUTHOR}_SUBSTITUTIONS are set, set + # {SLUG, CATEGORY, TAG, AUTHOR}_REGEX_SUBSTITUTIONS correctly + settings = {} + settings['SLUG_SUBSTITUTIONS'] = [('C++', 'cpp')] + settings['TAG_SUBSTITUTIONS'] = [('C#', 'csharp')] + settings['CATEGORY_SUBSTITUTIONS'] = [('C#', 'csharp')] + settings['AUTHOR_SUBSTITUTIONS'] = [('Alexander Todorov', 'atodorov')] + settings = handle_deprecated_settings(settings) + self.assertEqual(settings['TAG_REGEX_SUBSTITUTIONS'], + [(r'C\+\+', 'cpp')] + [(r'C\#', 'csharp')] + + default_slug_regex_subs) + self.assertEqual(settings['CATEGORY_REGEX_SUBSTITUTIONS'], + [(r'C\+\+', 'cpp')] + [(r'C\#', 'csharp')] + + default_slug_regex_subs) + self.assertEqual(settings['AUTHOR_REGEX_SUBSTITUTIONS'], + [(r'Alexander\ Todorov', 'atodorov')] + + default_slug_regex_subs) + + # Handle old 'skip' flags correctly + settings = {} + settings['SLUG_SUBSTITUTIONS'] = [('C++', 'cpp', True)] + settings['AUTHOR_SUBSTITUTIONS'] = [('Alexander Todorov', 'atodorov', + False)] + settings = handle_deprecated_settings(settings) + self.assertEqual(settings.get('SLUG_REGEX_SUBSTITUTIONS'), + [(r'C\+\+', 'cpp')] + + [(r'(?u)\A\s*', ''), (r'(?u)\s*\Z', '')]) + self.assertEqual(settings['AUTHOR_REGEX_SUBSTITUTIONS'], + [(r'Alexander\ Todorov', 'atodorov')] + + default_slug_regex_subs) diff --git 
a/pelican/tests/test_urlwrappers.py b/pelican/tests/test_urlwrappers.py index 21a7d98f..8ff3d9d6 100644 --- a/pelican/tests/test_urlwrappers.py +++ b/pelican/tests/test_urlwrappers.py @@ -55,30 +55,29 @@ class TestURLWrapper(unittest.TestCase): self.assertEqual(author, author_equal) cat_ascii = Category('指導書', settings={}) - self.assertEqual(cat_ascii, u'zhi-dao-shu') + self.assertEqual(cat_ascii, u'zhi dao shu') def test_slugify_with_substitutions_and_dots(self): - tag = Tag('Tag Dot', - settings={ - 'TAG_SUBSTITUTIONS': [('Tag Dot', 'tag.dot', True)] - }) + tag = Tag('Tag Dot', settings={'TAG_REGEX_SUBSTITUTIONS': [ + ('Tag Dot', 'tag.dot'), + ]}) cat = Category('Category Dot', - settings={ - 'CATEGORY_SUBSTITUTIONS': (('Category Dot', - 'cat.dot', - True),) - }) + settings={'CATEGORY_REGEX_SUBSTITUTIONS': [ + ('Category Dot', 'cat.dot'), + ]}) self.assertEqual(tag.slug, 'tag.dot') self.assertEqual(cat.slug, 'cat.dot') def test_author_slug_substitutions(self): - settings = { - 'AUTHOR_SUBSTITUTIONS': [ - ('Alexander Todorov', 'atodorov', False), - ('Krasimir Tsonev', 'krasimir', False), - ] - } + settings = {'AUTHOR_REGEX_SUBSTITUTIONS': [ + ('Alexander Todorov', 'atodorov'), + ('Krasimir Tsonev', 'krasimir'), + (r'[^\w\s-]', ''), + (r'(?u)\A\s*', ''), + (r'(?u)\s*\Z', ''), + (r'[-\s]+', '-'), + ]} author1 = Author('Mr. 
Senko', settings=settings) author2 = Author('Alexander Todorov', settings=settings) diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index b5b8b454..2c6c4cd8 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -119,8 +119,11 @@ class TestUtils(LoggedTestCase): ('大飯原発4号機、18日夜起動へ', 'da-fan-yuan-fa-4hao-ji-18ri-ye-qi-dong-he'),) + settings = read_settings() + subs = settings['SLUG_REGEX_SUBSTITUTIONS'] + for value, expected in samples: - self.assertEqual(utils.slugify(value), expected) + self.assertEqual(utils.slugify(value, regex_subs=subs), expected) def test_slugify_substitute(self): @@ -129,21 +132,27 @@ class TestUtils(LoggedTestCase): ('c++, c#, C#, C++', 'cpp-c-sharp-c-sharp-cpp'), ('c++-streams', 'cpp-streams'),) - subs = (('C++', 'CPP'), ('C#', 'C-SHARP')) + settings = read_settings() + subs = [ + (r'C\+\+', 'CPP'), + (r'C#', 'C-SHARP'), + ] + settings['SLUG_REGEX_SUBSTITUTIONS'] for value, expected in samples: - self.assertEqual(utils.slugify(value, subs), expected) + self.assertEqual(utils.slugify(value, regex_subs=subs), expected) def test_slugify_substitute_and_keeping_non_alphanum(self): samples = (('Fedora QA', 'fedora.qa'), ('C++ is used by Fedora QA', 'cpp is used by fedora.qa'), - ('C++ is based on C', 'cpp-is-based-on-c'), - ('C+++ test C+ test', 'cpp-test-c-test'),) + ('C++ is based on C', 'cpp is based on c'), + ('C+++ test C+ test', 'cpp+ test c+ test'),) - subs = (('Fedora QA', 'fedora.qa', True), - ('c++', 'cpp'),) + subs = [ + (r'Fedora QA', 'fedora.qa'), + (r'c\+\+', 'cpp'), + ] for value, expected in samples: - self.assertEqual(utils.slugify(value, subs), expected) + self.assertEqual(utils.slugify(value, regex_subs=subs), expected) def test_get_relative_path(self): diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 5b09fdae..b7b26283 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -17,6 +17,7 @@ from 
six.moves.urllib.request import urlretrieve # because logging.setLoggerClass has to be called before logging.getLogger from pelican.log import init +from pelican.settings import read_settings from pelican.utils import SafeDatetime, slugify try: @@ -291,6 +292,8 @@ def dc2fields(file): print("%i posts read." % len(posts)) + settings = read_settings() + subs = settings['SLUG_REGEX_SUBSTITUTIONS'] for post in posts: fields = post.split('","') @@ -383,8 +386,9 @@ def dc2fields(file): kind = 'article' # TODO: Recognise pages status = 'published' # TODO: Find a way for draft posts - yield (post_title, content, slugify(post_title), post_creadt, author, - categories, tags, status, kind, post_format) + yield (post_title, content, slugify(post_title, regex_subs=subs), + post_creadt, author, categories, tags, status, kind, + post_format) def posterous2fields(api_token, email, password): @@ -418,6 +422,8 @@ def posterous2fields(api_token, email, password): page = 1 posts = get_posterous_posts(api_token, email, password, page) + settings = read_settings() + subs = settings['SLUG_REGEX_SUBSTITUTIONS'] while len(posts) > 0: posts = get_posterous_posts(api_token, email, password, page) page += 1 @@ -425,7 +431,7 @@ def posterous2fields(api_token, email, password): for post in posts: slug = post.get('slug') if not slug: - slug = slugify(post.get('title')) + slug = slugify(post.get('title'), regex_subs=subs) tags = [tag.get('name') for tag in post.get('tags')] raw_date = post.get('display_date') date_object = SafeDatetime.strptime( @@ -469,13 +475,15 @@ def tumblr2fields(api_key, blogname): offset = 0 posts = get_tumblr_posts(api_key, blogname, offset) + settings = read_settings() + subs = settings['SLUG_REGEX_SUBSTITUTIONS'] while len(posts) > 0: for post in posts: title = \ post.get('title') or \ post.get('source_title') or \ post.get('type').capitalize() - slug = post.get('slug') or slugify(title) + slug = post.get('slug') or slugify(title, regex_subs=subs) tags = 
post.get('tags') timestamp = post.get('timestamp') date = SafeDatetime.fromtimestamp(int(timestamp)).strftime( @@ -552,6 +560,8 @@ def feed2fields(file): """Read a feed and yield pelican fields""" import feedparser d = feedparser.parse(file) + settings = read_settings() + subs = settings['SLUG_REGEX_SUBSTITUTIONS'] for entry in d.entries: date = (entry.updated_parsed.strftime('%Y-%m-%d %H:%M') if hasattr(entry, 'updated_parsed') else None) @@ -559,7 +569,7 @@ def feed2fields(file): tags = ([e['term'] for e in entry.tags] if hasattr(entry, 'tags') else None) - slug = slugify(entry.title) + slug = slugify(entry.title, regex_subs=subs) kind = 'article' yield (entry.title, entry.description, slug, date, author, [], tags, None, kind, 'html') @@ -621,7 +631,7 @@ def get_ext(out_markup, in_markup='html'): def get_out_filename(output_path, filename, ext, kind, - dirpage, dircat, categories, wp_custpost): + dirpage, dircat, categories, wp_custpost, slug_subs): filename = os.path.basename(filename) # Enforce filename restrictions for various filesystems at once; see @@ -647,12 +657,12 @@ def get_out_filename(output_path, filename, ext, kind, # create subdirectories with category names elif kind != 'article': if wp_custpost: - typename = slugify(kind) + typename = slugify(kind, regex_subs=slug_subs) else: typename = '' kind = 'article' if dircat and (len(categories) > 0): - catname = slugify(categories[0]) + catname = slugify(categories[0], regex_subs=slug_subs) else: catname = '' out_filename = os.path.join(output_path, typename, @@ -661,7 +671,7 @@ def get_out_filename(output_path, filename, ext, kind, os.makedirs(os.path.join(output_path, typename, catname)) # option to put files in directories with categories names elif dircat and (len(categories) > 0): - catname = slugify(categories[0]) + catname = slugify(categories[0], regex_subs=slug_subs) out_filename = os.path.join(output_path, catname, filename + ext) if not os.path.isdir(os.path.join(output_path, catname)): 
os.mkdir(os.path.join(output_path, catname)) @@ -768,6 +778,9 @@ def fields2pelican( 'requested import action.') exit(error) + settings = read_settings() + slug_subs = settings['SLUG_REGEX_SUBSTITUTIONS'] + for (title, content, filename, date, author, categories, tags, status, kind, in_markup) in fields: if filter_author and filter_author != author: @@ -796,7 +809,7 @@ def fields2pelican( out_filename = get_out_filename( output_path, filename, ext, kind, dirpage, dircat, - categories, wp_custpost) + categories, wp_custpost, slug_subs) print(out_filename) if in_markup in ('html', 'wp-html'): diff --git a/pelican/urlwrappers.py b/pelican/urlwrappers.py index 7659471d..860ca2ff 100644 --- a/pelican/urlwrappers.py +++ b/pelican/urlwrappers.py @@ -36,8 +36,9 @@ class URLWrapper(object): @property def slug(self): if self._slug is None: - self._slug = slugify(self.name, - self.settings.get('SLUG_SUBSTITUTIONS', ())) + self._slug = slugify( + self.name, + regex_subs=self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) return self._slug @slug.setter @@ -56,8 +57,8 @@ class URLWrapper(object): return hash(self.slug) def _normalize_key(self, key): - subs = self.settings.get('SLUG_SUBSTITUTIONS', ()) - return six.text_type(slugify(key, subs)) + subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []) + return six.text_type(slugify(key, regex_subs=subs)) def __eq__(self, other): if isinstance(other, self.__class__): @@ -115,10 +116,11 @@ class Category(URLWrapper): @property def slug(self): if self._slug is None: - substitutions = self.settings.get('SLUG_SUBSTITUTIONS', ()) - substitutions += tuple(self.settings.get('CATEGORY_SUBSTITUTIONS', - ())) - self._slug = slugify(self.name, substitutions) + if 'CATEGORY_REGEX_SUBSTITUTIONS' in self.settings: + subs = self.settings['CATEGORY_REGEX_SUBSTITUTIONS'] + else: + subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []) + self._slug = slugify(self.name, regex_subs=subs) return self._slug @@ -129,9 +131,11 @@ class Tag(URLWrapper): 
@property def slug(self): if self._slug is None: - substitutions = self.settings.get('SLUG_SUBSTITUTIONS', ()) - substitutions += tuple(self.settings.get('TAG_SUBSTITUTIONS', ())) - self._slug = slugify(self.name, substitutions) + if 'TAG_REGEX_SUBSTITUTIONS' in self.settings: + subs = self.settings['TAG_REGEX_SUBSTITUTIONS'] + else: + subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []) + self._slug = slugify(self.name, regex_subs=subs) return self._slug @@ -139,6 +143,9 @@ class Author(URLWrapper): @property def slug(self): if self._slug is None: - self._slug = slugify(self.name, - self.settings.get('AUTHOR_SUBSTITUTIONS', ())) + if 'AUTHOR_REGEX_SUBSTITUTIONS' in self.settings: + subs = self.settings['AUTHOR_REGEX_SUBSTITUTIONS'] + else: + subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []) + self._slug = slugify(self.name, regex_subs=subs) return self._slug diff --git a/pelican/utils.py b/pelican/utils.py index 573603da..efc32e0c 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -263,13 +263,14 @@ def pelican_open(filename, mode='rb', strip_crs=(sys.platform == 'win32')): yield content -def slugify(value, substitutions=()): +def slugify(value, regex_subs=()): """ Normalizes string, converts to lowercase, removes non-alpha characters, and converts spaces to hyphens. Took from Django sources. 
""" + # TODO Maybe steal again from current Django 1.5dev value = Markup(value).striptags() # value must be unicode per se @@ -281,37 +282,16 @@ def slugify(value, substitutions=()): if isinstance(value, six.binary_type): value = value.decode('ascii') # still unicode - value = unicodedata.normalize('NFKD', value).lower() + value = unicodedata.normalize('NFKD', value) - # backward compatible covert from 2-tuples to 3-tuples - new_subs = [] - for tpl in substitutions: - try: - src, dst, skip = tpl - except ValueError: - src, dst = tpl - skip = False - new_subs.append((src, dst, skip)) - substitutions = tuple(new_subs) + for src, dst in regex_subs: + value = re.sub(src, dst, value, flags=re.IGNORECASE) - # by default will replace non-alphanum characters - replace = True - for src, dst, skip in substitutions: - orig_value = value - value = value.replace(src.lower(), dst.lower()) - # if replacement was made then skip non-alphanum - # replacement if instructed to do so - if value != orig_value: - replace = replace and not skip - - if replace: - value = re.sub(r'[^\w\s-]', '', value).strip() - value = re.sub(r'[-\s]+', '-', value) - else: - value = value.strip() + # convert to lowercase + value = value.lower() # we want only ASCII chars - value = value.encode('ascii', 'ignore') + value = value.encode('ascii', 'ignore').strip() # but Pelican should generally use only unicode return value.decode('ascii')