mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Addresses #1467
This commit is contained in:
parent
cb11bea1f2
commit
2e04fb8b04
2 changed files with 313 additions and 11 deletions
|
|
@ -466,21 +466,16 @@ class Readers(FileStampDataCacher):
|
|||
|
||||
# eventually filter the content with typogrify if asked so
|
||||
if self.settings['TYPOGRIFY']:
|
||||
from typogrify.filters import typogrify
|
||||
|
||||
def typogrify_wrapper(text):
|
||||
"""Ensures ignore_tags feature is backward compatible"""
|
||||
try:
|
||||
return typogrify(text, self.settings['TYPOGRIFY_IGNORE_TAGS'])
|
||||
except TypeError:
|
||||
return typogrify(text)
|
||||
from pelican.typogrify import Typogrify
|
||||
typogrify = Typogrify()
|
||||
typogrify.ignores = self.settings['TYPOGRIFY_IGNORE_TAGS']
|
||||
|
||||
if content:
|
||||
content = typogrify_wrapper(content)
|
||||
metadata['title'] = typogrify_wrapper(metadata['title'])
|
||||
content = typogrify.filter(content)
|
||||
metadata['title'] = typogrify.filter(metadata['title'])
|
||||
|
||||
if 'summary' in metadata:
|
||||
metadata['summary'] = typogrify_wrapper(metadata['summary'])
|
||||
metadata['summary'] = typogrify.filter(metadata['summary'])
|
||||
|
||||
if context_signal:
|
||||
logger.debug('Signal %s.send(%s, <metadata>)',
|
||||
|
|
|
|||
307
pelican/typogrify.py
Normal file
307
pelican/typogrify.py
Normal file
|
|
@ -0,0 +1,307 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import sys
|
||||
import six
|
||||
|
||||
from six.moves.html_parser import HTMLParser
|
||||
from six.moves.html_entities import name2codepoint
|
||||
|
||||
# Used to represent any tag
|
||||
class any_tag:
    """Sentinel used as a dictionary key meaning "any tag".

    ``Typogrify._process_ignores`` stores selectors that are not bound to
    a particular tag (``.klass`` / ``#id``) under this key.
    """
    pass


class Typogrify(object):
    """Apply typographic filters to the text nodes of an HTML fragment.

    The document is walked with a small ``HTMLParser`` subclass; the text
    between tags is passed through ``initial_quotes``, ``smarty_pants``,
    ``amp`` and ``caps`` and, when an approved tag closes, ``widont``.
    Markup is copied through untouched. Elements matching the ignore list
    (tags, ``.class`` / ``#id`` selectors, or ``tag.class`` / ``tag#id``)
    are left unfiltered together with everything nested inside them.
    """

    # class variables
    __ignores = None
    __default_ignores = ['pre', 'code', 'script', 'kbd']
    __filters = ['smartypants', 'widont', 'caps', 'amp', 'initial_quotes']

    # Tags whose closing triggers the widont filter.
    _widont_tags = frozenset([
        'a', 'em', 'span', 'strong', 'i', 'b', 'p', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'dt', 'dd',
    ])

    # The patterns are compiled once instead of on every call.
    _amp_finder = re.compile(r"""
        (\s|&nbsp;)             # Group 1: prefixed whitespace
        (?:&amp;|&\#38;|&)      # The actual ampersand (non capturing group)
        (\s|&nbsp;)             # Group 2: suffixed whitespace
        """, re.VERBOSE)

    _cap_finder = re.compile(r"""
        (                       # Start group capture
        (?=(?:\d*[A-Z]){2})     # Look ahead: at least two caps, digits may be interspersed
        (?:[A-Z\d']*)           # Any amount of caps, digits or dumb apostrophes
        |                       # Or
        (?:[A-Z]+\.\s??){2,}    # Caps followed by '.', at least twice (\s?? is non-greedy)
        )                       # End group capture
        """, re.VERBOSE)

    _widont_finder = re.compile(r"""
        (.*\S)?                   # Group 1: everything before the final gap (may be absent)
        \s+                       # The final whitespace before the last word
        ((?:[^<>\s]|<[^<>]*>)+)   # Group 2: the last word; whole tags may be part of it
        (\s*)\Z                   # Group 3: trailing whitespace, kept as is
        """, re.VERBOSE | re.DOTALL)

    _quote_finder = re.compile(r"""
        \A(\s*)                   # Group 1: leading whitespace
        (?:
        ("|&ldquo;|&\#8220;)      # Group 2: a double quote
        |                         # Or
        ('|&lsquo;|&\#8216;)      # Group 3: a single quote
        )
        """, re.VERBOSE)

    class _HTMLParser(HTMLParser):
        """Typogrify HTML Parser: A very simple parser, it determines when
        HTML text is being processed (as opposed to HTML tags) and applies
        the typogrify filters to the text.

        Output is accumulated in ``data_buffer``: raw markup is copied
        from the source document lazily (whenever the next text node is
        reached, and once more in ``get_output``) and text nodes are
        appended after filtering.
        """

        # Elements that never have an end tag; they must not take part in
        # the depth counting that tracks ignored elements.
        void_elements = frozenset([
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
            'link', 'meta', 'param', 'source', 'track', 'wbr',
        ])

        def __init__(self, typogrify, html_doc):
            """Parse ``html_doc`` immediately, filtering with ``typogrify``.

            ``typogrify`` is the owning ``Typogrify`` instance and
            ``html_doc`` the markup to process (it is stripped first).
            """
            self.html_doc = html_doc.strip()
            try:
                # Python 3.4+
                HTMLParser.__init__(self, convert_charrefs=False)
            except TypeError:
                HTMLParser.__init__(self)

            # Per-instance state. Keeping the dict on the class would
            # share it between every parser ever created.
            self.current_pos = 0           # index of the first uncopied source char
            self.filtering = True          # default is to filter everything
            self.intermediate_tags = 0     # nesting depth inside an ignored element
            self.data_buffer = ''
            self.filtered_data_length = 0  # length of the last filtered text node

            # Mark the new line positions - needed to turn the
            # (line, column) pairs from getpos() into string indexes.
            self.new_line_pos = {1: 0}
            line = 1
            for index, char in enumerate(self.html_doc):
                if char == "\n":
                    line += 1
                    # Add one due to index being zero based
                    self.new_line_pos[line] = index + 1

            self.typogrify = typogrify
            self.feed(self.html_doc)  # start parsing

        def handle_starttag(self, tag, attrs):
            """Decide whether the content of ``tag`` should be filtered.

            While ``intermediate_tags`` > 0 an ancestor is being ignored,
            so the tag only increases the nesting depth.
            """
            if tag in self.void_elements:
                # No content and no end tag: nothing to track.
                return

            if self.intermediate_tags > 0:
                self.intermediate_tags += 1
                return

            self.filtering = self.typogrify._should_be_filtered(tag, attrs)
            self.intermediate_tags = 0 if self.filtering else 1

        def handle_data(self, data):
            """Append a text node to the output, filtered unless the
            node lies inside an ignored element."""
            line_num, offset = self.getpos()
            start = self.new_line_pos[line_num] + offset

            # Copy the markup between the previous text node and this one.
            self.data_buffer += self.html_doc[self.current_pos:start]

            if self.filtering:
                content = self.typogrify._apply_filters(data, self.lasttag)
                self.filtered_data_length = len(content)
            else:
                content = data
                # Ignored text must never be picked up by widont later on.
                self.filtered_data_length = 0

            self.data_buffer += content
            self.current_pos = start + len(data)

        def handle_endtag(self, tag):
            """Leave an ignored element, or apply widont to the text
            node that precedes the closing tag."""
            if tag in self.void_elements:
                return

            if self.intermediate_tags > 0:
                self.intermediate_tags -= 1
                if self.intermediate_tags == 0:
                    # The ignored element is closed: resume filtering.
                    self.filtering = True
                return

            # Widont filter needs to be handled here
            length = self.filtered_data_length
            if not length:
                return

            tail = self.data_buffer[-length:]
            replaced = self.typogrify.widont(tag, tail)
            if replaced != tail:
                self.data_buffer = self.data_buffer[:-length] + replaced
                # The same text must not be processed again when the
                # enclosing elements close.
                self.filtered_data_length = 0

        def get_output(self):
            """Return the processed document, appending whatever source
            text follows the last text node."""
            if self.current_pos < len(self.html_doc):
                self.data_buffer += self.html_doc[self.current_pos:]
                self.current_pos = len(self.html_doc)

            return self.data_buffer

    def __init__(self):
        """Class constructor"""

        # Set default variables
        self.ignores = []  # sets ignores to defaults

    @property
    def ignores(self):
        """Write-only: reading always yields ``None``; assign a list of
        selectors to configure what is ignored."""
        pass  # make ignore_tags unaccessible

    @ignores.setter
    def ignores(self, value):
        """The setter of the ignore list, the format is
        as follows: ['div','span.test','#test'] would
        ignore: the tag div, the tag span if it has
        a class of test, all id's set to test.

        The default ignores are always added. ``value`` itself is not
        modified; ``None`` means "defaults only".
        """
        if value is None:
            value = ()
        elif hasattr(value, 'strip'):
            # A lone selector string instead of a list of selectors.
            value = (value,)

        # Build a new list: ``value += ...`` would grow the caller's list.
        combined = list(value) + self.__default_ignores
        tags, attributes = self._process_ignores(combined)
        self.__ignores = [tags, attributes]

    def _process_ignores(self, ignores):
        """User specified HTML tags or attributes can be ignored. This
        method classifies the different ignores into three categories:
        1) Tags to be ignored (e.g. span, div)
        2) Attributes to be ignored, with # representing an id, and .
           representing a class (e.g. #test - ignore all id's that
           are set to test)
        3) Attributes on tags, using the same attribute format as
           mentioned above (e.g. span.test - ignore all span elements
           that have class set to test)

        Returns ``(ignored_tags, ignored_attributes)``: a set of tag
        names and a dict mapping a tag name (or ``any_tag``) to a set
        of ``.class`` / ``#id`` selectors.
        """
        # strip whitespace, make unique and drop empty entries
        selectors = set(item.strip() for item in ignores)
        selectors.discard('')

        ignored_tags = set()
        ignored_attributes = {any_tag: set()}
        tag_selectors = []

        for selector in selectors:
            if selector.startswith(('.', '#')):
                ignored_attributes[any_tag].add(selector)
            elif '.' in selector or '#' in selector:
                tag_selectors.append(selector)
            else:
                ignored_tags.add(selector)

        for selector in tag_selectors:
            tag, marker, name = re.split(r'([.#])', selector, maxsplit=1)

            # Do not process if tag is already being ignored
            if tag not in ignored_tags:
                ignored_attributes.setdefault(tag, set()).add(marker + name)

        return (ignored_tags, ignored_attributes)

    def _should_be_filtered(self, tag, attrs):
        """Determines if the current html node should be filtered.
        A node should be filtered if its tag or its class or id
        attribute was not specified to be ignored by the user.

        ``attrs`` is the list of ``(name, value)`` pairs handed to
        ``handle_starttag``.
        """
        ignored_tags, ignored_attributes = self.__ignores

        # Test if the node's tag should be filtered
        if tag in ignored_tags:
            return False

        # ``|`` builds a new set; ``|=`` would permanently merge the
        # tag specific selectors into the "any tag" ones.
        selectors = ignored_attributes[any_tag] | ignored_attributes.get(tag, set())
        if not selectors:
            return True

        # Test if the node's attributes should be filtered
        for name, value in attrs:
            if not value:
                continue
            if name == 'class':
                # Check the raw value and every single class name.
                candidates = [value] + value.split()
                if any('.%s' % candidate in selectors for candidate in candidates):
                    return False
            elif name == 'id':
                if '#%s' % value in selectors:
                    return False

        return True

    #
    # Typogrify Filters
    #
    def amp(self, text):
        """Wraps ampersands in HTML with ``<span class="amp">`` so they can be
        styled with CSS. Ampersands are also normalized to ``&amp;``. Requires
        ampersands to have whitespace or an ``&nbsp;`` on both sides."""
        return self._amp_finder.sub(r'\1<span class="amp">&amp;</span>\2', text)

    def caps(self, text):
        """Wraps multiple capital letters in ``<span class="caps">``
        so they can be styled with CSS."""
        return self._cap_finder.sub(r'<span class="caps">\1</span>', text)

    def widont(self, tag, text):
        """Replaces the space between the last two words in a string with ``&nbsp;``
        Works in these block tags ``(h1-h6, p, li, dd, dt)`` and also accounts for
        potential closing inline elements ``a, em, strong, span, b, i``.

        Only whitespace outside of markup is considered, so spans added
        by the other filters stay intact. Trailing whitespace is kept.
        """
        # Must be inside an approved tag
        if tag not in self._widont_tags:
            return text

        match = self._widont_finder.match(text)
        if match is None:
            return text

        head, last_word, trailing = match.groups()
        return '%s&nbsp;%s%s' % (head or '', last_word, trailing)

    def initial_quotes(self, text):
        """Wraps initial quotes in ``class="dquo"`` for double quotes or
        ``class="quo"`` for single quotes. Only a quote at the very start
        of the text (leading whitespace aside) is wrapped."""
        match = self._quote_finder.match(text)
        if match is None:
            return text

        leading, double, single = match.groups()
        css_class = 'dquo' if double else 'quo'
        return '%s<span class="%s">%s</span>%s' % (
            leading, css_class, double or single, text[match.end():])

    def smarty_pants(self, text):
        """Applies smarty pants to html text.

        The ``smartypants`` package is optional: when it cannot be
        imported the text is returned unchanged.
        """
        try:
            import smartypants
        except ImportError:
            import logging
            logging.getLogger(__name__).debug(
                'smartypants is not installed; skipping the SmartyPants filter')
            return text

        return smartypants.smartypants(text)

    def _apply_filters(self, text, tag):
        """Applies the above filters to the text nodes of the HTML doc.

        ``tag`` is the most recently opened tag; it is accepted for the
        caller's benefit but currently not used.
        """
        # The order of the filters below is important
        # and should not be changed

        # initial_quotes needs to happen at this point so that
        # attribute values introduced later on do not get affected
        text = self.initial_quotes(text)
        text = self.smarty_pants(text)
        text = self.amp(text)
        text = self.caps(text)

        return text

    def filter(self, html_doc, tags=None, session_ignores=None, session_filters=None):
        """Produces Typogryfied html for the Pelican static project.

        ``tags``, ``session_ignores`` and ``session_filters`` are accepted
        but currently unused. Empty input is returned as is.
        """
        if not html_doc:
            return html_doc

        parser = self._HTMLParser(self, html_doc)

        return parser.get_output()
|
||||
Loading…
Add table
Add a link
Reference in a new issue