diff --git a/docs/settings.rst b/docs/settings.rst index 48344076..8c021563 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -320,12 +320,6 @@ Basic settings A list of default Pygments settings for your reStructuredText code blocks. See :ref:`internal_pygments_options` for a list of supported options. -.. data:: SLUGIFY_SOURCE = 'title' - - Specifies where you want the slug to be automatically generated from. Can be - set to ``title`` to use the 'Title:' metadata tag or ``basename`` to use the - article's file name when creating the slug. - .. data:: CACHE_CONTENT = False If ``True``, saves content in caches. See @@ -621,6 +615,25 @@ corresponding ``*_URL`` setting as string, while others hard-code them: ``'archives.html'``, ``'authors.html'``, ``'categories.html'``, ``'tags.html'``. + +.. data:: SLUGIFY_SOURCE = 'title' + + Specifies where you want the slug to be automatically generated from. Can be + set to ``title`` to use the 'Title:' metadata tag or ``basename`` to use the + article's file name when creating the slug. + +.. data:: SLUGIFY_USE_UNICODE = False + + Allow Unicode characters in slugs. Set to ``True`` to keep Unicode characters + in auto-generated slugs. Otherwise, Unicode characters will be replaced + with their closest ASCII equivalents. + + +.. data:: SLUGIFY_PRESERVE_CASE = False + + Preserve uppercase characters in slugs. Set to ``True`` to keep the + uppercase characters from the ``SLUGIFY_SOURCE`` as-is. + .. 
data:: SLUG_REGEX_SUBSTITUTIONS = [ (r'[^\\w\\s-]', ''), # remove non-alphabetical/whitespace/'-' chars (r'(?u)\\A\\s*', ''), # strip leading whitespace diff --git a/pelican/contents.py b/pelican/contents.py index 40d9c28e..d01b241f 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -92,16 +92,18 @@ class Content(object): if not hasattr(self, 'slug'): if (settings['SLUGIFY_SOURCE'] == 'title' and hasattr(self, 'title')): - self.slug = slugify( - self.title, - regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) + value = self.title elif (settings['SLUGIFY_SOURCE'] == 'basename' and source_path is not None): - basename = os.path.basename( - os.path.splitext(source_path)[0]) + value = os.path.basename(os.path.splitext(source_path)[0]) + else: + value = None + if value is not None: self.slug = slugify( - basename, - regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) + value, + regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []), + preserve_case=settings.get('SLUGIFY_PRESERVE_CASE', False), + use_unicode=settings.get('SLUGIFY_USE_UNICODE', False)) self.source_path = source_path self.relative_source_path = self.get_relative_source_path() diff --git a/pelican/settings.py b/pelican/settings.py index b53cde50..d508aa36 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -155,6 +155,8 @@ DEFAULT_CONFIG = { ], 'INTRASITE_LINK_REGEX': '[{|](?P<what>.*?)[|}]', 'SLUGIFY_SOURCE': 'title', + 'SLUGIFY_USE_UNICODE': False, + 'SLUGIFY_PRESERVE_CASE': False, 'CACHE_CONTENT': False, 'CONTENT_CACHING_LAYER': 'reader', 'CACHE_PATH': 'cache', diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 08d4eb73..ebde9c3c 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -135,6 +135,32 @@ class TestPage(LoggedTestCase): page = Page(**page_kwargs) self.assertEqual(page.slug, 'foo') + # test slug from title with unicode and case + + inputs = ( + # (title, expected, preserve_case, use_unicode) + ('指導書', 
'zhi-dao-shu', False, False), + ('指導書', 'Zhi-Dao-Shu', True, False), + ('指導書', '指導書', False, True), + ('指導書', '指導書', True, True), + ('Çığ', 'cig', False, False), + ('Çığ', 'Cig', True, False), + ('Çığ', 'çığ', False, True), + ('Çığ', 'Çığ', True, True), + ) + + settings = get_settings() + page_kwargs = self._copy_page_kwargs() + page_kwargs['settings'] = settings + + for title, expected, preserve_case, use_unicode in inputs: + settings['SLUGIFY_PRESERVE_CASE'] = preserve_case + settings['SLUGIFY_USE_UNICODE'] = use_unicode + page_kwargs['metadata']['title'] = title + page = Page(**page_kwargs) + self.assertEqual(page.slug, expected, + (title, preserve_case, use_unicode)) + def test_defaultlang(self): # If no lang is given, default to the default one. page = Page(**self.page_kwargs) diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 3a8c4bd8..4a03cdc4 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -128,6 +128,45 @@ class TestUtils(LoggedTestCase): self.assertEqual( utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat') + def test_slugify_use_unicode(self): + + samples = ( + ('this is a test', 'this-is-a-test'), + ('this is a test', 'this-is-a-test'), + ('this → is ← a ↑ test', 'this-is-a-test'), + ('this--is---a test', 'this-is-a-test'), + ('unicode測試許功蓋,你看到了嗎?', 'unicode測試許功蓋你看到了嗎'), + ('Çığ', 'çığ') + ) + + settings = read_settings() + subs = settings['SLUG_REGEX_SUBSTITUTIONS'] + + for value, expected in samples: + self.assertEqual( + utils.slugify(value, regex_subs=subs, use_unicode=True), + expected) + + # check with preserve case + for value, expected in samples: + self.assertEqual( + utils.slugify('Çığ', regex_subs=subs, + preserve_case=True, use_unicode=True), + 'Çığ') + + # check normalization + samples = ( + ('大飯原発4号機、18日夜起動へ', '大飯原発4号機18日夜起動へ'), + ( + '\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}', + '\N{LATIN SMALL LETTER C WITH CEDILLA}' + ) + ) + for value, expected in samples: + 
self.assertEqual( + utils.slugify(value, regex_subs=subs, use_unicode=True), + expected) + def test_slugify_substitute(self): samples = (('C++ is based on C', 'cpp-is-based-on-c'), diff --git a/pelican/urlwrappers.py b/pelican/urlwrappers.py index cc276b3f..d01611ba 100644 --- a/pelican/urlwrappers.py +++ b/pelican/urlwrappers.py @@ -34,15 +34,16 @@ class URLWrapper(object): if self._slug is None: class_key = '{}_REGEX_SUBSTITUTIONS'.format( self.__class__.__name__.upper()) - if class_key in self.settings: - self._slug = slugify( - self.name, - regex_subs=self.settings[class_key]) - else: - self._slug = slugify( - self.name, - regex_subs=self.settings.get( - 'SLUG_REGEX_SUBSTITUTIONS', [])) + regex_subs = self.settings.get( + class_key, + self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) + preserve_case = self.settings.get('SLUGIFY_PRESERVE_CASE', False) + self._slug = slugify( + self.name, + regex_subs=regex_subs, + preserve_case=preserve_case, + use_unicode=self.settings.get('SLUGIFY_USE_UNICODE', False) + ) return self._slug @slug.setter @@ -61,8 +62,18 @@ class URLWrapper(object): return hash(self.slug) def _normalize_key(self, key): - subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []) - return slugify(key, regex_subs=subs) + class_key = '{}_REGEX_SUBSTITUTIONS'.format( + self.__class__.__name__.upper()) + regex_subs = self.settings.get( + class_key, + self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) + use_unicode = self.settings.get('SLUGIFY_USE_UNICODE', False) + preserve_case = self.settings.get('SLUGIFY_PRESERVE_CASE', False) + return slugify( + key, + regex_subs=regex_subs, + preserve_case=preserve_case, + use_unicode=use_unicode) def __eq__(self, other): if isinstance(other, self.__class__): diff --git a/pelican/utils.py b/pelican/utils.py index b1536de8..c1b79ed9 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -222,7 +222,7 @@ def pelican_open(filename, mode='r', strip_crs=(sys.platform == 'win32')): yield content -def slugify(value, 
regex_subs=(), preserve_case=False): +def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False): """ Normalizes string, converts to lowercase, removes non-alpha characters, and converts spaces to hyphens. @@ -230,28 +230,36 @@ def slugify(value, regex_subs=(), preserve_case=False): Took from Django sources. """ - # TODO Maybe steal again from current Django 1.5dev - value = Markup(value).striptags() - # value must be unicode per se import unicodedata - from unidecode import unidecode - value = unidecode(value) - if isinstance(value, bytes): - value = value.decode('ascii') - # still unicode - value = unicodedata.normalize('NFKD', value) + import unidecode + def normalize_unicode(text): + # normalize text by compatibility composition + # see: https://en.wikipedia.org/wiki/Unicode_equivalence + return unicodedata.normalize('NFKC', text) + + # strip tags from value + value = Markup(value).striptags() + + # normalization + value = normalize_unicode(value) + + if not use_unicode: + # ASCII-fy + value = unidecode.unidecode(value) + + # perform regex substitutions for src, dst in regex_subs: - value = re.sub(src, dst, value, flags=re.IGNORECASE) + value = re.sub( + normalize_unicode(src), + normalize_unicode(dst), + value, + flags=re.IGNORECASE) - # convert to lowercase if not preserve_case: value = value.lower() - # we want only ASCII chars - value = value.encode('ascii', 'ignore').strip() - # but Pelican should generally use only unicode - return value.decode('ascii') + return value.strip() def copy(source, destination, ignores=None):