mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Merge pull request #2731 from avaris/unicode-slugify
Add support for Unicode slugs
This commit is contained in:
commit
7e24886190
7 changed files with 141 additions and 40 deletions
|
|
@ -320,12 +320,6 @@ Basic settings
|
||||||
A list of default Pygments settings for your reStructuredText code blocks.
|
A list of default Pygments settings for your reStructuredText code blocks.
|
||||||
See :ref:`internal_pygments_options` for a list of supported options.
|
See :ref:`internal_pygments_options` for a list of supported options.
|
||||||
|
|
||||||
.. data:: SLUGIFY_SOURCE = 'title'
|
|
||||||
|
|
||||||
Specifies where you want the slug to be automatically generated from. Can be
|
|
||||||
set to ``title`` to use the 'Title:' metadata tag or ``basename`` to use the
|
|
||||||
article's file name when creating the slug.
|
|
||||||
|
|
||||||
.. data:: CACHE_CONTENT = False
|
.. data:: CACHE_CONTENT = False
|
||||||
|
|
||||||
If ``True``, saves content in caches. See
|
If ``True``, saves content in caches. See
|
||||||
|
|
@ -621,6 +615,25 @@ corresponding ``*_URL`` setting as string, while others hard-code them:
|
||||||
``'archives.html'``, ``'authors.html'``, ``'categories.html'``,
|
``'archives.html'``, ``'authors.html'``, ``'categories.html'``,
|
||||||
``'tags.html'``.
|
``'tags.html'``.
|
||||||
|
|
||||||
|
|
||||||
|
.. data:: SLUGIFY_SOURCE = 'title'
|
||||||
|
|
||||||
|
Specifies where you want the slug to be automatically generated from. Can be
|
||||||
|
set to ``title`` to use the 'Title:' metadata tag or ``basename`` to use the
|
||||||
|
article's file name when creating the slug.
|
||||||
|
|
||||||
|
.. data:: SLUGIFY_USE_UNICODE = False
|
||||||
|
|
||||||
|
Allow Unicode characters in slugs. Set to ``True`` to keep Unicode characters
|
||||||
|
in auto-generated slugs. Otherwise, non-ASCII characters will be replaced
|
||||||
|
with ASCII equivalents.
|
||||||
|
|
||||||
|
|
||||||
|
.. data:: SLUGIFY_PRESERVE_CASE = False
|
||||||
|
|
||||||
|
Preserve uppercase characters in the slugs. Set to ``True`` to keep the
|
||||||
|
uppercase characters in the ``SLUGIFY_SOURCE`` as is.
|
||||||
|
|
||||||
.. data:: SLUG_REGEX_SUBSTITUTIONS = [
|
.. data:: SLUG_REGEX_SUBSTITUTIONS = [
|
||||||
(r'[^\\w\\s-]', ''), # remove non-alphabetical/whitespace/'-' chars
|
(r'[^\\w\\s-]', ''), # remove non-alphabetical/whitespace/'-' chars
|
||||||
(r'(?u)\\A\\s*', ''), # strip leading whitespace
|
(r'(?u)\\A\\s*', ''), # strip leading whitespace
|
||||||
|
|
|
||||||
|
|
@ -92,16 +92,18 @@ class Content(object):
|
||||||
if not hasattr(self, 'slug'):
|
if not hasattr(self, 'slug'):
|
||||||
if (settings['SLUGIFY_SOURCE'] == 'title' and
|
if (settings['SLUGIFY_SOURCE'] == 'title' and
|
||||||
hasattr(self, 'title')):
|
hasattr(self, 'title')):
|
||||||
self.slug = slugify(
|
value = self.title
|
||||||
self.title,
|
|
||||||
regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
|
|
||||||
elif (settings['SLUGIFY_SOURCE'] == 'basename' and
|
elif (settings['SLUGIFY_SOURCE'] == 'basename' and
|
||||||
source_path is not None):
|
source_path is not None):
|
||||||
basename = os.path.basename(
|
value = os.path.basename(os.path.splitext(source_path)[0])
|
||||||
os.path.splitext(source_path)[0])
|
else:
|
||||||
|
value = None
|
||||||
|
if value is not None:
|
||||||
self.slug = slugify(
|
self.slug = slugify(
|
||||||
basename,
|
value,
|
||||||
regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
|
regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []),
|
||||||
|
preserve_case=settings.get('SLUGIFY_PRESERVE_CASE', False),
|
||||||
|
use_unicode=settings.get('SLUGIFY_USE_UNICODE', False))
|
||||||
|
|
||||||
self.source_path = source_path
|
self.source_path = source_path
|
||||||
self.relative_source_path = self.get_relative_source_path()
|
self.relative_source_path = self.get_relative_source_path()
|
||||||
|
|
|
||||||
|
|
@ -155,6 +155,8 @@ DEFAULT_CONFIG = {
|
||||||
],
|
],
|
||||||
'INTRASITE_LINK_REGEX': '[{|](?P<what>.*?)[|}]',
|
'INTRASITE_LINK_REGEX': '[{|](?P<what>.*?)[|}]',
|
||||||
'SLUGIFY_SOURCE': 'title',
|
'SLUGIFY_SOURCE': 'title',
|
||||||
|
'SLUGIFY_USE_UNICODE': False,
|
||||||
|
'SLUGIFY_PRESERVE_CASE': False,
|
||||||
'CACHE_CONTENT': False,
|
'CACHE_CONTENT': False,
|
||||||
'CONTENT_CACHING_LAYER': 'reader',
|
'CONTENT_CACHING_LAYER': 'reader',
|
||||||
'CACHE_PATH': 'cache',
|
'CACHE_PATH': 'cache',
|
||||||
|
|
|
||||||
|
|
@ -135,6 +135,32 @@ class TestPage(LoggedTestCase):
|
||||||
page = Page(**page_kwargs)
|
page = Page(**page_kwargs)
|
||||||
self.assertEqual(page.slug, 'foo')
|
self.assertEqual(page.slug, 'foo')
|
||||||
|
|
||||||
|
# test slug from title with unicode and case
|
||||||
|
|
||||||
|
inputs = (
|
||||||
|
# (title, expected, preserve_case, use_unicode)
|
||||||
|
('指導書', 'zhi-dao-shu', False, False),
|
||||||
|
('指導書', 'Zhi-Dao-Shu', True, False),
|
||||||
|
('指導書', '指導書', False, True),
|
||||||
|
('指導書', '指導書', True, True),
|
||||||
|
('Çığ', 'cig', False, False),
|
||||||
|
('Çığ', 'Cig', True, False),
|
||||||
|
('Çığ', 'çığ', False, True),
|
||||||
|
('Çığ', 'Çığ', True, True),
|
||||||
|
)
|
||||||
|
|
||||||
|
settings = get_settings()
|
||||||
|
page_kwargs = self._copy_page_kwargs()
|
||||||
|
page_kwargs['settings'] = settings
|
||||||
|
|
||||||
|
for title, expected, preserve_case, use_unicode in inputs:
|
||||||
|
settings['SLUGIFY_PRESERVE_CASE'] = preserve_case
|
||||||
|
settings['SLUGIFY_USE_UNICODE'] = use_unicode
|
||||||
|
page_kwargs['metadata']['title'] = title
|
||||||
|
page = Page(**page_kwargs)
|
||||||
|
self.assertEqual(page.slug, expected,
|
||||||
|
(title, preserve_case, use_unicode))
|
||||||
|
|
||||||
def test_defaultlang(self):
|
def test_defaultlang(self):
|
||||||
# If no lang is given, default to the default one.
|
# If no lang is given, default to the default one.
|
||||||
page = Page(**self.page_kwargs)
|
page = Page(**self.page_kwargs)
|
||||||
|
|
|
||||||
|
|
@ -128,6 +128,45 @@ class TestUtils(LoggedTestCase):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat')
|
utils.slugify('Cat', regex_subs=subs, preserve_case=True), 'Cat')
|
||||||
|
|
||||||
|
def test_slugify_use_unicode(self):
|
||||||
|
|
||||||
|
samples = (
|
||||||
|
('this is a test', 'this-is-a-test'),
|
||||||
|
('this is a test', 'this-is-a-test'),
|
||||||
|
('this → is ← a ↑ test', 'this-is-a-test'),
|
||||||
|
('this--is---a test', 'this-is-a-test'),
|
||||||
|
('unicode測試許功蓋,你看到了嗎?', 'unicode測試許功蓋你看到了嗎'),
|
||||||
|
('Çığ', 'çığ')
|
||||||
|
)
|
||||||
|
|
||||||
|
settings = read_settings()
|
||||||
|
subs = settings['SLUG_REGEX_SUBSTITUTIONS']
|
||||||
|
|
||||||
|
for value, expected in samples:
|
||||||
|
self.assertEqual(
|
||||||
|
utils.slugify(value, regex_subs=subs, use_unicode=True),
|
||||||
|
expected)
|
||||||
|
|
||||||
|
# check with preserve case
|
||||||
|
for value, expected in samples:
|
||||||
|
self.assertEqual(
|
||||||
|
utils.slugify('Çığ', regex_subs=subs,
|
||||||
|
preserve_case=True, use_unicode=True),
|
||||||
|
'Çığ')
|
||||||
|
|
||||||
|
# check normalization
|
||||||
|
samples = (
|
||||||
|
('大飯原発4号機、18日夜起動へ', '大飯原発4号機18日夜起動へ'),
|
||||||
|
(
|
||||||
|
'\N{LATIN SMALL LETTER C}\N{COMBINING CEDILLA}',
|
||||||
|
'\N{LATIN SMALL LETTER C WITH CEDILLA}'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
for value, expected in samples:
|
||||||
|
self.assertEqual(
|
||||||
|
utils.slugify(value, regex_subs=subs, use_unicode=True),
|
||||||
|
expected)
|
||||||
|
|
||||||
def test_slugify_substitute(self):
|
def test_slugify_substitute(self):
|
||||||
|
|
||||||
samples = (('C++ is based on C', 'cpp-is-based-on-c'),
|
samples = (('C++ is based on C', 'cpp-is-based-on-c'),
|
||||||
|
|
|
||||||
|
|
@ -34,15 +34,16 @@ class URLWrapper(object):
|
||||||
if self._slug is None:
|
if self._slug is None:
|
||||||
class_key = '{}_REGEX_SUBSTITUTIONS'.format(
|
class_key = '{}_REGEX_SUBSTITUTIONS'.format(
|
||||||
self.__class__.__name__.upper())
|
self.__class__.__name__.upper())
|
||||||
if class_key in self.settings:
|
regex_subs = self.settings.get(
|
||||||
self._slug = slugify(
|
class_key,
|
||||||
self.name,
|
self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
|
||||||
regex_subs=self.settings[class_key])
|
preserve_case = self.settings.get('SLUGIFY_PRESERVE_CASE', False)
|
||||||
else:
|
self._slug = slugify(
|
||||||
self._slug = slugify(
|
self.name,
|
||||||
self.name,
|
regex_subs=regex_subs,
|
||||||
regex_subs=self.settings.get(
|
preserve_case=preserve_case,
|
||||||
'SLUG_REGEX_SUBSTITUTIONS', []))
|
use_unicode=self.settings.get('SLUGIFY_USE_UNICODE', False)
|
||||||
|
)
|
||||||
return self._slug
|
return self._slug
|
||||||
|
|
||||||
@slug.setter
|
@slug.setter
|
||||||
|
|
@ -61,8 +62,18 @@ class URLWrapper(object):
|
||||||
return hash(self.slug)
|
return hash(self.slug)
|
||||||
|
|
||||||
def _normalize_key(self, key):
|
def _normalize_key(self, key):
|
||||||
subs = self.settings.get('SLUG_REGEX_SUBSTITUTIONS', [])
|
class_key = '{}_REGEX_SUBSTITUTIONS'.format(
|
||||||
return slugify(key, regex_subs=subs)
|
self.__class__.__name__.upper())
|
||||||
|
regex_subs = self.settings.get(
|
||||||
|
class_key,
|
||||||
|
self.settings.get('SLUG_REGEX_SUBSTITUTIONS', []))
|
||||||
|
use_unicode = self.settings.get('SLUGIFY_USE_UNICODE', False)
|
||||||
|
preserve_case = self.settings.get('SLUGIFY_PRESERVE_CASE', False)
|
||||||
|
return slugify(
|
||||||
|
key,
|
||||||
|
regex_subs=regex_subs,
|
||||||
|
preserve_case=preserve_case,
|
||||||
|
use_unicode=use_unicode)
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
if isinstance(other, self.__class__):
|
if isinstance(other, self.__class__):
|
||||||
|
|
|
||||||
|
|
@ -222,7 +222,7 @@ def pelican_open(filename, mode='r', strip_crs=(sys.platform == 'win32')):
|
||||||
yield content
|
yield content
|
||||||
|
|
||||||
|
|
||||||
def slugify(value, regex_subs=(), preserve_case=False):
|
def slugify(value, regex_subs=(), preserve_case=False, use_unicode=False):
|
||||||
"""
|
"""
|
||||||
Normalizes string, converts to lowercase, removes non-alpha characters,
|
Normalizes string, converts to lowercase, removes non-alpha characters,
|
||||||
and converts spaces to hyphens.
|
and converts spaces to hyphens.
|
||||||
|
|
@ -230,28 +230,36 @@ def slugify(value, regex_subs=(), preserve_case=False):
|
||||||
Took from Django sources.
|
Took from Django sources.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# TODO Maybe steal again from current Django 1.5dev
|
|
||||||
value = Markup(value).striptags()
|
|
||||||
# value must be unicode per se
|
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from unidecode import unidecode
|
import unidecode
|
||||||
value = unidecode(value)
|
|
||||||
if isinstance(value, bytes):
|
|
||||||
value = value.decode('ascii')
|
|
||||||
# still unicode
|
|
||||||
value = unicodedata.normalize('NFKD', value)
|
|
||||||
|
|
||||||
|
def normalize_unicode(text):
|
||||||
|
# normalize text by compatibility composition
|
||||||
|
# see: https://en.wikipedia.org/wiki/Unicode_equivalence
|
||||||
|
return unicodedata.normalize('NFKC', text)
|
||||||
|
|
||||||
|
# strip tags from value
|
||||||
|
value = Markup(value).striptags()
|
||||||
|
|
||||||
|
# normalization
|
||||||
|
value = normalize_unicode(value)
|
||||||
|
|
||||||
|
if not use_unicode:
|
||||||
|
# ASCII-fy
|
||||||
|
value = unidecode.unidecode(value)
|
||||||
|
|
||||||
|
# perform regex substitutions
|
||||||
for src, dst in regex_subs:
|
for src, dst in regex_subs:
|
||||||
value = re.sub(src, dst, value, flags=re.IGNORECASE)
|
value = re.sub(
|
||||||
|
normalize_unicode(src),
|
||||||
|
normalize_unicode(dst),
|
||||||
|
value,
|
||||||
|
flags=re.IGNORECASE)
|
||||||
|
|
||||||
# convert to lowercase
|
|
||||||
if not preserve_case:
|
if not preserve_case:
|
||||||
value = value.lower()
|
value = value.lower()
|
||||||
|
|
||||||
# we want only ASCII chars
|
return value.strip()
|
||||||
value = value.encode('ascii', 'ignore').strip()
|
|
||||||
# but Pelican should generally use only unicode
|
|
||||||
return value.decode('ascii')
|
|
||||||
|
|
||||||
|
|
||||||
def copy(source, destination, ignores=None):
|
def copy(source, destination, ignores=None):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue