From 648165b839fe9562822fa843cbdbfe224b12dfbd Mon Sep 17 00:00:00 2001 From: "Mr. Senko" Date: Mon, 14 Mar 2016 00:16:58 +0200 Subject: [PATCH] More granular control of tags and categories slugs. Fixes #1873 - add TAG_SUBSTITUTIONS and CATEGORY_SUBSTITUTIONS settings - make slugify keep non-alphanumeric characters if configured --- docs/changelog.rst | 9 ++++++++- docs/settings.rst | 22 ++++++++++++++++++++-- pelican/contents.py | 1 + pelican/tests/test_contents.py | 25 ++++++++++++++++++++++++- pelican/tests/test_urlwrappers.py | 15 +++++++++++++++ pelican/tests/test_utils.py | 12 ++++++++++++ pelican/urlwrappers.py | 17 ++++++++++++++++- pelican/utils.py | 30 +++++++++++++++++++++++++++--- 8 files changed, 123 insertions(+), 8 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index f52d6449..315a6b2b 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -4,7 +4,14 @@ Release history Next release ============ -- Nothing yet +* ``SLUG_SUBSTITUTIONS`` now accepts 3-tuple elements, allowing to keep + non-alphanum characters. Existing 2-tuple configurations will continue to work + without change in behavior. The new 3rd parameter has side effects when there + are multiple substitutions defined. Please see the docs. +* Tag and category slugs can be controlled with greater precision using the + ``TAG_SUBSTITUTIONS`` and ``CATEGORY_SUBSTITUTIONS`` settings. These also + allow for keeping non-alphanum characters for backward compatibility with + existing URLs. 3.6.3 (2015-08-14) ================== diff --git a/docs/settings.rst b/docs/settings.rst index 0adda992..3a8511bc 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -306,8 +306,12 @@ Setting name (followed by default value, if any) What does it do? ``DAY_ARCHIVE_SAVE_AS = ''`` The location to save per-day archives of your posts. ``SLUG_SUBSTITUTIONS = ()`` Substitutions to make prior to stripping out non-alphanumerics when generating slugs. 
Specified - as a list of 2-tuples of ``(from, to)`` which are - applied in order. + as a list of 3-tuples of ``(from, to, skip)`` which are + applied in order. ``skip`` is a boolean indicating whether + or not to skip replacement of non-alphanumeric characters. + Useful for backward compatibility with existing URLs. +``CATEGORY_SUBSTITUTIONS = ()`` Added to ``SLUG_SUBSTITUTIONS`` for categories. +``TAG_SUBSTITUTIONS = ()`` Added to ``SLUG_SUBSTITUTIONS`` for tags. ====================================================== ============================================================== .. note:: @@ -317,6 +321,20 @@ Setting name (followed by default value, if any) What does it do? set the corresponding ``*_SAVE_AS`` setting to ``''`` to prevent the relevant page from being generated. +.. note:: + + Substitutions are applied in order with the side effect that keeping + non-alphanum characters applies to the whole string when a replacement + is made. For example if you have the following setting + ``SLUG_SUBSTITUTIONS = (('C++', 'cpp'), ('keep dot', 'keep.dot', True))`` + the string ``Keep Dot`` will be converted to ``keep.dot``, however + ``C++ will keep dot`` will be converted to ``cpp will keep.dot`` instead + of ``cpp-will-keep.dot``! + + If you want to keep non-alphanum characters only for tags or categories + but not other slugs then configure ``TAG_SUBSTITUTIONS`` and + ``CATEGORY_SUBSTITUTIONS`` respectively! + Pelican can optionally create per-year, per-month, and per-day archives of your posts. These secondary archives are disabled by default but are automatically enabled if you supply format strings for their respective ``_SAVE_AS`` settings. 
diff --git a/pelican/contents.py b/pelican/contents.py index 0123384a..9b6aa971 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -172,6 +172,7 @@ class Content(object): 'lang': getattr(self, 'lang', 'en'), 'date': getattr(self, 'date', SafeDatetime.now()), 'author': self.author.slug if hasattr(self, 'author') else '', + 'tag': self.tag.slug if hasattr(self, 'tag') else '', 'category': self.category.slug if hasattr(self, 'category') else '' }) return metadata diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 6f0f6dd9..d62d0ed6 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -11,7 +11,7 @@ from jinja2.utils import generate_lorem_ipsum import six -from pelican.contents import Article, Author, Category, Page, Static +from pelican.contents import Article, Author, Category, Page, Static, Tag from pelican.settings import DEFAULT_CONFIG from pelican.signals import content_object_init from pelican.tests.support import LoggedTestCase, get_settings, unittest @@ -457,6 +457,29 @@ class TestArticle(TestPage): self.assertEqual( article.save_as, 'obrien/csharp-stuff/fnord/index.html') + def test_slugify_category_with_dots(self): + settings = get_settings() + settings['CATEGORY_SUBSTITUTIONS'] = [('Fedora QA', 'fedora.qa', True)] + settings['ARTICLE_URL'] = '{category}/{slug}/' + article_kwargs = self._copy_page_kwargs() + article_kwargs['metadata']['category'] = Category('Fedora QA', + settings) + article_kwargs['metadata']['title'] = 'This Week in Fedora QA' + article_kwargs['settings'] = settings + article = Article(**article_kwargs) + self.assertEqual(article.url, 'fedora.qa/this-week-in-fedora-qa/') + + def test_slugify_tags_with_dots(self): + settings = get_settings() + settings['TAG_SUBSTITUTIONS'] = [('Fedora QA', 'fedora.qa', True)] + settings['ARTICLE_URL'] = '{tag}/{slug}/' + article_kwargs = self._copy_page_kwargs() + article_kwargs['metadata']['tag'] = Tag('Fedora QA', settings) + 
article_kwargs['metadata']['title'] = 'This Week in Fedora QA' + article_kwargs['settings'] = settings + article = Article(**article_kwargs) + self.assertEqual(article.url, 'fedora.qa/this-week-in-fedora-qa/') + class TestStatic(LoggedTestCase): diff --git a/pelican/tests/test_urlwrappers.py b/pelican/tests/test_urlwrappers.py index f3dc8198..db194776 100644 --- a/pelican/tests/test_urlwrappers.py +++ b/pelican/tests/test_urlwrappers.py @@ -56,3 +56,18 @@ class TestURLWrapper(unittest.TestCase): cat_ascii = Category('指導書', settings={}) self.assertEqual(cat_ascii, u'zhi-dao-shu') + + def test_slugify_with_substitutions_and_dots(self): + tag = Tag('Tag Dot', + settings={ + 'TAG_SUBSTITUTIONS': [('Tag Dot', 'tag.dot', True)] + }) + cat = Category('Category Dot', + settings={ + 'CATEGORY_SUBSTITUTIONS': (('Category Dot', + 'cat.dot', + True),) + }) + + self.assertEqual(tag.slug, 'tag.dot') + self.assertEqual(cat.slug, 'cat.dot') diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index fed7e94b..8dfc0b9b 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -131,6 +131,18 @@ class TestUtils(LoggedTestCase): for value, expected in samples: self.assertEqual(utils.slugify(value, subs), expected) + def test_slugify_substitute_and_keeping_non_alphanum(self): + + samples = (('Fedora QA', 'fedora.qa'), + ('C++ is used by Fedora QA', 'cpp is used by fedora.qa'), + ('C++ is based on C', 'cpp-is-based-on-c'), + ('C+++ test C+ test', 'cpp-test-c-test'),) + + subs = (('Fedora QA', 'fedora.qa', True), + ('c++', 'cpp'),) + for value, expected in samples: + self.assertEqual(utils.slugify(value, subs), expected) + def test_get_relative_path(self): samples = ((os.path.join('test', 'test.html'), os.pardir), diff --git a/pelican/urlwrappers.py b/pelican/urlwrappers.py index bf1199a8..e56fea8f 100644 --- a/pelican/urlwrappers.py +++ b/pelican/urlwrappers.py @@ -112,13 +112,28 @@ class URLWrapper(object): class Category(URLWrapper): - pass + 
@property + def slug(self): + if self._slug is None: + substitutions = self.settings.get('SLUG_SUBSTITUTIONS', ()) + substitutions += tuple(self.settings.get('CATEGORY_SUBSTITUTIONS', + ())) + self._slug = slugify(self.name, substitutions) + return self._slug class Tag(URLWrapper): def __init__(self, name, *args, **kwargs): super(Tag, self).__init__(name.strip(), *args, **kwargs) + @property + def slug(self): + if self._slug is None: + substitutions = self.settings.get('SLUG_SUBSTITUTIONS', ()) + substitutions += tuple(self.settings.get('TAG_SUBSTITUTIONS', ())) + self._slug = slugify(self.name, substitutions) + return self._slug + class Author(URLWrapper): pass diff --git a/pelican/utils.py b/pelican/utils.py index b5685a3b..4e729361 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -270,10 +270,34 @@ def slugify(value, substitutions=()): value = value.decode('ascii') # still unicode value = unicodedata.normalize('NFKD', value).lower() - for src, dst in substitutions: + + # backward compatible convert from 2-tuples to 3-tuples + new_subs = [] + for tpl in substitutions: + try: + src, dst, skip = tpl + except ValueError: + src, dst = tpl + skip = False + new_subs.append((src, dst, skip)) + substitutions = tuple(new_subs) + + # by default will replace non-alphanum characters + replace = True + for src, dst, skip in substitutions: + orig_value = value value = value.replace(src.lower(), dst.lower()) - value = re.sub('[^\w\s-]', '', value).strip() - value = re.sub('[-\s]+', '-', value) + # if replacement was made then skip non-alphanum + # replacement if instructed to do so + if value != orig_value: + replace = replace and not skip + + if replace: + value = re.sub('[^\w\s-]', '', value).strip() + value = re.sub('[-\s]+', '-', value) + else: + value = value.strip() + # we want only ASCII chars value = value.encode('ascii', 'ignore') # but Pelican should generally use only unicode