From 03d9c38871e0561c8796350d9f27bfc674027fc2 Mon Sep 17 00:00:00 2001 From: Deniz Turgut Date: Sun, 19 Apr 2020 17:23:26 +0300 Subject: [PATCH 1/3] Rewrite pelican.utils.slugify to use unicode and add tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a use_unicode kwarg to slugify to keep unicode characters as is (no ASCII-fying) and adds tests for it. Also reworks the slugification logic. slugify started with the Django method for slugifying: - Normalize to compatibility decomposed form (NFKD) - Encode and decode with 'ascii' This works fine if the decomposed form contains ASCII characters (e.g. ç can be changed into c+CEDILLA and ASCII would keep c only), but fails when decomposition doesn't result in ASCII characters (e.g. Chinese). To solve that, 'unidecode' was added, which works fine for both cases. However, the old method then became redundant but was kept. This commit removes the old method and adjusts the logic slightly. Now slugify will normalize all text with composition mode (NFKC) to unify the format for regex substitutions. Then, if use_unicode is False, it uses unidecode to convert the text to ASCII. 
--- pelican/tests/test_utils.py | 39 ++++++++++++++++++++++++++++++++++++ pelican/utils.py | 40 ++++++++++++++++++++++--------------- 2 files changed, 63 insertions(+), 16 deletions(-) diff --git a/pelican/tests/test_utils.py b/pelican/tests/test_utils.py index 3a8c4bd8..4a03cdc4 100644 --- a/pelican/tests/test_utils.py +++ b/pelican/tests/test_utils.py @@ -128,6 +128,45 @@ class TestUtils(LoggedTestCase): self.assertEqual( utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat') + def test_slugify_use_unicode(self): + + samples = ( + ('this is a test', 'this-is-a-test'), + ('this is a test', 'this-is-a-test'), + ('this → is ← a ↑ test', 'this-is-a-test'), + ('this--is---a test', 'this-is-a-test'), + ('unicode測試許功蓋,你看到了嗎?', 'unicode測試許功蓋你看到了嗎'), + ('Çığ', 'çığ') + ) + + settings = read_settings() + subs = settings['SLUG_REGEX_SUBSTITUTIONS'] + + for value, expected in samples: + self.assertEqual( + utils.slugify(value, regex_subs=subs, use_unicode=True), + expected) + + # check with preserve case + for value, expected in samples: + self.assertEqual( + utils.slugify('Çığ', regex_subs=subs, + preserve_case=True, use_unicode=True), + 'Çığ') + + # check normalization + samples = ( + ('大飯原発4号機、18日夜起動へ', '大飯原発4号機18日夜起動へ'), + ( + '\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}', + '\N{LATIN SMALL LETTER C WITH CEDILLA}' + ) + ) + for value, expected in samples: + self.assertEqual( + utils.slugify(value, regex_subs=subs, use_unicode=True), + expected) + def test_slugify_substitute(self): samples = (('C++ is based on C', 'cpp-is-based-on-c'), diff --git a/pelican/utils.py b/pelican/utils.py index b1536de8..c1b79ed9 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -222,7 +222,7 @@ def pelican_open(filename, mode='r', strip_crs=(sys.platform == 'win32')): yield content -def slugify(value, regex_subs=(), preserve_case=False): +def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False): """ Normalizes string, converts to lowercase, removes 
non-alpha characters, and converts spaces to hyphens. @@ -230,28 +230,36 @@ def slugify(value, regex_subs=(), preserve_case=False): Took from Django sources. """ - # TODO Maybe steal again from current Django 1.5dev - value = Markup(value).striptags() - # value must be unicode per se import unicodedata - from unidecode import unidecode - value = unidecode(value) - if isinstance(value, bytes): - value = value.decode('ascii') - # still unicode - value = unicodedata.normalize('NFKD', value) + import unidecode + def normalize_unicode(text): + # normalize text by compatibility composition + # see: https://en.wikipedia.org/wiki/Unicode_equivalence + return unicodedata.normalize('NFKC', text) + + # strip tags from value + value = Markup(value).striptags() + + # normalization + value = normalize_unicode(value) + + if not use_unicode: + # ASCII-fy + value = unidecode.unidecode(value) + + # perform regex substitutions for src, dst in regex_subs: - value = re.sub(src, dst, value, flags=re.IGNORECASE) + value = re.sub( + normalize_unicode(src), + normalize_unicode(dst), + value, + flags=re.IGNORECASE) - # convert to lowercase if not preserve_case: value = value.lower() - # we want only ASCII chars - value = value.encode('ascii', 'ignore').strip() - # but Pelican should generally use only unicode - return value.decode('ascii') + return value.strip() def copy(source, destination, ignores=None): From 97fe235e60a8f46346b35fa32a02ff9e1e5ba395 Mon Sep 17 00:00:00 2001 From: Deniz Turgut Date: Sun, 19 Apr 2020 18:51:55 +0300 Subject: [PATCH 2/3] Expose use_unicode setting of slugify in settings and use it --- docs/settings.rst | 19 +++++++++++++------ pelican/contents.py | 15 ++++++++------- pelican/settings.py | 1 + pelican/tests/test_contents.py | 11 +++++++++++ pelican/urlwrappers.py | 26 +++++++++++++++----------- 5 files changed, 48 insertions(+), 24 deletions(-) diff --git a/docs/settings.rst b/docs/settings.rst index 48344076..60e539d3 100644 --- a/docs/settings.rst +++ 
b/docs/settings.rst @@ -320,12 +320,6 @@ Basic settings A list of default Pygments settings for your reStructuredText code blocks. See :ref:`internal_pygments_options` for a list of supported options. -.. data:: SLUGIFY_SOURCE = 'title' - - Specifies where you want the slug to be automatically generated from. Can be - set to ``title`` to use the 'Title:' metadata tag or ``basename`` to use the - article's file name when creating the slug. - .. data:: CACHE_CONTENT = False If ``True``, saves content in caches. See @@ -621,6 +615,19 @@ corresponding ``*_URL`` setting as string, while others hard-code them: ``'archives.html'``, ``'authors.html'``, ``'categories.html'``, ``'tags.html'``. + +.. data:: SLUGIFY_SOURCE = 'title' + + Specifies where you want the slug to be automatically generated from. Can be + set to ``title`` to use the 'Title:' metadata tag or ``basename`` to use the + article's file name when creating the slug. + +.. data:: SLUGIFY_USE_UNICODE = False + + Allow unicode characters in slugs. Set ``True`` to keep unicode characters + in auto-generated slugs. Otherwise, unicode characters will be replaced + with ASCII equivalents. + .. 
data:: SLUG_REGEX_SUBSTITUTIONS = [ (r'[^\\w\\s-]', ''), # remove non-alphabetical/whitespace/'-' chars (r'(?u)\\A\\s*', ''), # strip leading whitespace diff --git a/pelican/contents.py b/pelican/contents.py index 40d9c28e..b49e1f2e 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -92,16 +92,17 @@ class Content(object): if not hasattr(self, 'slug'): if (settings['SLUGIFY_SOURCE'] == 'title' and hasattr(self, 'title')): - self.slug = slugify( - self.title, - regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) + value = self.title elif (settings['SLUGIFY_SOURCE'] == 'basename' and source_path is not None): - basename = os.path.basename( - os.path.splitext(source_path)[0]) + value = os.path.basename(os.path.splitext(source_path)[0]) + else: + value = None + if value is not None: self.slug = slugify( - basename, - regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) + value, + regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []), + use_unicode=settings['SLUGIFY_USE_UNICODE']) self.source_path = source_path self.relative_source_path = self.get_relative_source_path() diff --git a/pelican/settings.py b/pelican/settings.py index 77aea059..0e0397c9 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -155,6 +155,7 @@ DEFAULT_CONFIG = { ], 'INTRASITE_LINK_REGEX': '[{|](?P.*?)[|}]', 'SLUGIFY_SOURCE': 'title', + 'SLUGIFY_USE_UNICODE': False, 'CACHE_CONTENT': False, 'CONTENT_CACHING_LAYER': 'reader', 'CACHE_PATH': 'cache', diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 08d4eb73..2a377e75 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -135,6 +135,17 @@ class TestPage(LoggedTestCase): page = Page(**page_kwargs) self.assertEqual(page.slug, 'foo') + # test slug from unicode title + # slug doesn't use unicode + settings['SLUGIFY_SOURCE'] = "title" + page_kwargs['metadata']['title'] = '指導書' + page = Page(**page_kwargs) + self.assertEqual(page.slug, 'zhi-dao-shu') + # slug uses 
unicode + settings['SLUGIFY_USE_UNICODE'] = True + page = Page(**page_kwargs) + self.assertEqual(page.slug, '指導書') + def test_defaultlang(self): # If no lang is given, default to the default one. page = Page(**self.page_kwargs) diff --git a/pelican/urlwrappers.py b/pelican/urlwrappers.py index cc276b3f..86f9ef32 100644 --- a/pelican/urlwrappers.py +++ b/pelican/urlwrappers.py @@ -34,15 +34,14 @@ class URLWrapper(object): if self._slug is None: class_key = '{}_REGEX_SUBSTITUTIONS'.format( self.__class__.__name__.upper()) - if class_key in self.settings: - self._slug = slugify( - self.name, - regex_subs=self.settings[class_key]) - else: - self._slug = slugify( - self.name, - regex_subs=self.settings.get( - 'SLUG_REGEX_SUBSTITUTIONS', [])) + regex_subs = self.settings.get( + class_key, + self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) + self._slug = slugify( + self.name, + regex_subs=regex_subs, + use_unicode=self.settings.get('SLUGIFY_USE_UNICODE', False) + ) return self._slug @slug.setter @@ -61,8 +60,13 @@ class URLWrapper(object): return hash(self.slug) def _normalize_key(self, key): - subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []) - return slugify(key, regex_subs=subs) + class_key = '{}_REGEX_SUBSTITUTIONS'.format( + self.__class__.__name__.upper()) + regex_subs = self.settings.get( + class_key, + self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) + use_unicode = self.settings.get('SLUGIFY_USE_UNICODE', False) + return slugify(key, regex_subs=regex_subs, use_unicode=use_unicode) def __eq__(self, other): if isinstance(other, self.__class__): From bd699d34e88da469fa71e904e9ce3f7df6276c84 Mon Sep 17 00:00:00 2001 From: Deniz Turgut Date: Tue, 21 Apr 2020 00:26:00 +0300 Subject: [PATCH 3/3] Expose preserve_case option from slugify --- docs/settings.rst | 6 ++++++ pelican/contents.py | 3 ++- pelican/settings.py | 1 + pelican/tests/test_contents.py | 35 ++++++++++++++++++++++++---------- pelican/urlwrappers.py | 9 ++++++++- 5 files changed, 42 
insertions(+), 12 deletions(-) diff --git a/docs/settings.rst b/docs/settings.rst index 60e539d3..8c021563 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -628,6 +628,12 @@ corresponding ``*_URL`` setting as string, while others hard-code them: in auto-generated slugs. Otherwise, unicode characters will be replaced with ASCII equivalents. + +.. data:: SLUGIFY_PRESERVE_CASE = False + + Preserve uppercase characters in the slugs. Set ``True`` to keep the + uppercase characters in the ``SLUGIFY_SOURCE`` as is. + .. data:: SLUG_REGEX_SUBSTITUTIONS = [ (r'[^\\w\\s-]', ''), # remove non-alphabetical/whitespace/'-' chars (r'(?u)\\A\\s*', ''), # strip leading whitespace diff --git a/pelican/contents.py b/pelican/contents.py index b49e1f2e..d01b241f 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -102,7 +102,8 @@ class Content(object): self.slug = slugify( value, regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []), - use_unicode=settings['SLUGIFY_USE_UNICODE']) + preserve_case=settings.get('SLUGIFY_PRESERVE_CASE', False), + use_unicode=settings.get('SLUGIFY_USE_UNICODE', False)) self.source_path = source_path self.relative_source_path = self.get_relative_source_path() diff --git a/pelican/settings.py b/pelican/settings.py index 0e0397c9..85cacbf0 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -156,6 +156,7 @@ DEFAULT_CONFIG = { 'INTRASITE_LINK_REGEX': '[{|](?P.*?)[|}]', 'SLUGIFY_SOURCE': 'title', 'SLUGIFY_USE_UNICODE': False, + 'SLUGIFY_PRESERVE_CASE': False, 'CACHE_CONTENT': False, 'CONTENT_CACHING_LAYER': 'reader', 'CACHE_PATH': 'cache', diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index 2a377e75..ebde9c3c 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -135,16 +135,31 @@ class TestPage(LoggedTestCase): page = Page(**page_kwargs) self.assertEqual(page.slug, 'foo') - # test slug from unicode title - # slug doesn't use unicode - settings['SLUGIFY_SOURCE'] = "title" 
- page_kwargs['metadata']['title'] = '指導書' - page = Page(**page_kwargs) - self.assertEqual(page.slug, 'zhi-dao-shu') - # slug uses unicode - settings['SLUGIFY_USE_UNICODE'] = True - page = Page(**page_kwargs) - self.assertEqual(page.slug, '指導書') + # test slug from title with unicode and case + + inputs = ( + # (title, expected, preserve_case, use_unicode) + ('指導書', 'zhi-dao-shu', False, False), + ('指導書', 'Zhi-Dao-Shu', True, False), + ('指導書', '指導書', False, True), + ('指導書', '指導書', True, True), + ('Çığ', 'cig', False, False), + ('Çığ', 'Cig', True, False), + ('Çığ', 'çığ', False, True), + ('Çığ', 'Çığ', True, True), + ) + + settings = get_settings() + page_kwargs = self._copy_page_kwargs() + page_kwargs['settings'] = settings + + for title, expected, preserve_case, use_unicode in inputs: + settings['SLUGIFY_PRESERVE_CASE'] = preserve_case + settings['SLUGIFY_USE_UNICODE'] = use_unicode + page_kwargs['metadata']['title'] = title + page = Page(**page_kwargs) + self.assertEqual(page.slug, expected, + (title, preserve_case, use_unicode)) def test_defaultlang(self): # If no lang is given, default to the default one. 
diff --git a/pelican/urlwrappers.py b/pelican/urlwrappers.py index 86f9ef32..d01611ba 100644 --- a/pelican/urlwrappers.py +++ b/pelican/urlwrappers.py @@ -37,9 +37,11 @@ class URLWrapper(object): regex_subs = self.settings.get( class_key, self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) + preserve_case = self.settings.get('SLUGIFY_PRESERVE_CASE', False) self._slug = slugify( self.name, regex_subs=regex_subs, + preserve_case=preserve_case, use_unicode=self.settings.get('SLUGIFY_USE_UNICODE', False) ) return self._slug @@ -66,7 +68,12 @@ class URLWrapper(object): class_key, self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])) use_unicode = self.settings.get('SLUGIFY_USE_UNICODE', False) - return slugify(key, regex_subs=regex_subs, use_unicode=use_unicode) + preserve_case = self.settings.get('SLUGIFY_PRESERVE_CASE', False) + return slugify( + key, + regex_subs=regex_subs, + preserve_case=preserve_case, + use_unicode=use_unicode) def __eq__(self, other): if isinstance(other, self.__class__):