From 9fb5969c59ca47911679b1c8f994ddf613cec522 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Sun, 10 Jun 2012 17:58:05 -0400 Subject: [PATCH 1/6] Allow settings to specify a summary length, optionally allowing unlimited summary length --- pelican/contents.py | 4 +++- pelican/settings.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pelican/contents.py b/pelican/contents.py index f5f3a1dc..b8bb0993 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -139,7 +139,9 @@ class Page(object): if hasattr(self, '_summary'): return self._summary else: - return truncate_html_words(self.content, 50) + if self.settings['SUMMARY_MAX_LENGTH']: + return truncate_html_words(self.content, self.settings['SUMMARY_MAX_LENGTH']) + return self.content def _set_summary(self, summary): """Dummy function""" diff --git a/pelican/settings.py b/pelican/settings.py index 4da66989..a8c8bea4 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -68,6 +68,7 @@ _DEFAULT_CONFIG = {'PATH': '.', 'ARTICLE_PERMALINK_STRUCTURE': '', 'TYPOGRIFY': False, 'LESS_GENERATOR': False, + 'SUMARY_MAX_LENGTH': 50, } From 876c7f509392d6c245a81e26fd5fd2a81851281f Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Sun, 10 Jun 2012 18:26:53 -0400 Subject: [PATCH 2/6] turn utils.open into actual context manager so as to better handle encoding warnings --- pelican/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pelican/utils.py b/pelican/utils.py index d4e34842..db15a343 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import contextlib import os import re import pytz @@ -32,10 +33,10 @@ def get_date(string): pass raise ValueError("'%s' is not a valid date" % string) - +@contextlib.contextmanager def open(filename): """Open a file and return it's content""" - return _open(filename, encoding='utf-8').read() + yield _open(filename, encoding='utf-8').read() def slugify(value): From c6d1de14f3db63705b35fbe3fd5db3701ea962a4 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Sun, 10 Jun 2012 18:27:38 -0400 Subject: [PATCH 3/6] better html parser --- pelican/readers.py | 101 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 2 deletions(-) diff --git a/pelican/readers.py b/pelican/readers.py index 83565918..83cb7e3b 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -13,8 +13,11 @@ try: from markdown import Markdown except ImportError: Markdown = False # NOQA +import cgi +from HTMLParser import HTMLParser import re + from pelican.contents import Category, Tag, Author from pelican.utils import get_date, open @@ -126,13 +129,12 @@ class MarkdownReader(Reader): metadata[name] = self.process_metadata(name, value[0]) return content, metadata - +""" class HtmlReader(Reader): file_extensions = ['html', 'htm'] _re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>') def read(self, filename): - """Parse content and metadata of (x)HTML files""" with open(filename) as content: metadata = {'title': 'unnamed'} for i in self._re.findall(content): @@ -142,6 +144,101 @@ class HtmlReader(Reader): metadata[name] = self.process_metadata(name, value) return content, metadata +""" + +class PelicanHTMLParser(HTMLParser): + def __init__(self, settings): + HTMLParser.__init__(self) + self.body = '' + self.metadata = {} + self.settings = settings + + self._data_buffer = '' + + self._in_top_level = True + self._in_head = False + self._in_title = False + self._in_body = False + self._in_tags = False + + def handle_starttag(self, tag, attrs): + if tag == 'head' and self._in_top_level: + self._in_top_level = False + self._in_head = True + elif tag == 'title' and self._in_head: + self._in_title = True + self._data_buffer = '' + elif tag == 'body' and self._in_top_level: + self._in_top_level = False + self._in_body = True + self._data_buffer = '' + elif tag == 'meta' and self._in_head: + self._handle_meta_tag(attrs) + + elif self._in_body: + self._data_buffer += self.build_tag(tag, attrs, False) + + def handle_endtag(self, tag): + if tag == 'head': + if self._in_head: + self._in_head = False + self._in_top_level = True + elif tag == 'title': + self._in_title = False + self.metadata['title'] = self._data_buffer + elif tag == 'body': + self.body = self._data_buffer + self._in_body = False + self._in_top_level = True + elif self._in_body: + self._data_buffer += ''.format(cgi.escape(tag)) + + def handle_startendtag(self, tag, attrs): + if tag == 'meta' and self._in_head: + self._handle_meta_tag(attrs) + if self._in_body: + self._data_buffer += self.build_tag(tag, attrs, True) + + def handle_comment(self, data): + if self._in_body and data.strip() == 'PELICAN_END_SUMMARY': + self.metadata['summary'] = self._data_buffer + + def handle_data(self, data): + self._data_buffer += data + + def build_tag(self, tag, attrs, close_tag): + result = '<{}'.format(cgi.escape(tag)) + result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs)) + if close_tag: + return result + ' />' + return result + '>' + + def _handle_meta_tag(self, attrs): + name = self._attr_value(attrs, 'name') + contents = self._attr_value(attrs, 'contents', '') + if name == 'keywords': + if contents: + self.metadata['tags'] = [Tag(unicode(tag), self.settings) for tag in contents.split(',')] + elif name == 'date': + self.metadata['date'] = get_date(contents) + else: + self.metadata[name] = contents + + @classmethod + def _attr_value(cls, attrs, name, default=None): + return next((x[1] for x in attrs if x[0] == name), default) + +class HTMLReader(Reader): + file_extensions = ['htm', 'html'] + enabled = True + + def read(self, filename): + """Parse content and metadata of markdown files""" + with open(filename) as content: + parser = PelicanHTMLParser(self.settings) + parser.feed(content) + parser.close() + return parser.body, parser.metadata _EXTENSIONS = {} From c6f1d0aadaa5e1dd71ad2a6897169f9da4458c2a Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 11 Jun 2012 08:39:13 -0400 Subject: [PATCH 4/6] fix SUMMARY_MAX_LENGTH, document it, and test it --- docs/settings.rst | 6 ++++++ pelican/settings.py | 2 +- tests/test_contents.py | 38 ++++++++++++++++++++++++++++++-------- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/docs/settings.rst b/docs/settings.rst index 85e9f0c3..a26c37dd 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -90,6 +90,12 @@ Setting name (default value) What doe index pages for collections of content e.g. tags and category index pages. `PAGINATED_DIRECT_TEMPLATES` (``('index',)``) Provides the direct templates that should be paginated. +`SUMMARY_MAX_LENGTH` (``50``) When creating a short summary of an article, this will + be the default length in words of the text created. + This only applies if your content does not otherwise + specify a summary. Setting to None will cause the summary + to be a copy of the original content. + ===================================================================== ===================================================================== .. [#] Default is the system locale. diff --git a/pelican/settings.py b/pelican/settings.py index a8c8bea4..d2a39cd9 100644 --- a/pelican/settings.py +++ b/pelican/settings.py @@ -68,7 +68,7 @@ _DEFAULT_CONFIG = {'PATH': '.', 'ARTICLE_PERMALINK_STRUCTURE': '', 'TYPOGRIFY': False, 'LESS_GENERATOR': False, - 'SUMARY_MAX_LENGTH': 50, + 'SUMMARY_MAX_LENGTH': 50, } diff --git a/tests/test_contents.py b/tests/test_contents.py index c6ef29a8..e7c9ad01 100644 --- a/tests/test_contents.py +++ b/tests/test_contents.py @@ -4,6 +4,7 @@ from .support import unittest from pelican.contents import Page from pelican.settings import _DEFAULT_CONFIG +from pelican.utils import truncate_html_words from jinja2.utils import generate_lorem_ipsum @@ -48,6 +49,20 @@ class TestPage(unittest.TestCase): page = Page(**self.page_kwargs) self.assertEqual(page.summary, TEST_SUMMARY) + def test_summary_max_length(self): + """If a :SUMMARY_MAX_LENGTH: is set, and there is no other summary, generated summary + should not exceed the given length.""" + page_kwargs = self._copy_page_kwargs() + settings = _DEFAULT_CONFIG.copy() + page_kwargs['settings'] = settings + del page_kwargs['metadata']['summary'] + settings['SUMMARY_MAX_LENGTH'] = None + page = Page(**page_kwargs) + self.assertEqual(page.summary, TEST_CONTENT) + settings['SUMMARY_MAX_LENGTH'] = 10 + page = Page(**page_kwargs) + self.assertEqual(page.summary, truncate_html_words(TEST_CONTENT, 10)) + def test_slug(self): """If a title is given, it should be used to generate the slug.""" page = Page(**self.page_kwargs) @@ -83,14 +98,9 @@ class TestPage(unittest.TestCase): from datetime import datetime from sys import platform dt = datetime(2015, 9, 13) - # make a deep copy of page_kawgs - page_kwargs = dict([(key, self.page_kwargs[key]) for key in - self.page_kwargs]) - for key in page_kwargs: - if not isinstance(page_kwargs[key], dict): - break - page_kwargs[key] = dict([(subkey, page_kwargs[key][subkey]) - for subkey in page_kwargs[key]]) + + page_kwargs = self._copy_page_kwargs() + # set its date to dt page_kwargs['metadata']['date'] = dt page = Page(**page_kwargs) @@ -124,3 +134,15 @@ class TestPage(unittest.TestCase): # Until we find some other method to test this functionality, we # will simply skip this test. unittest.skip("There is no locale %s in this system." % locale) + + def _copy_page_kwargs(self): + # make a deep copy of page_kwargs + page_kwargs = dict([(key, self.page_kwargs[key]) for key in + self.page_kwargs]) + for key in page_kwargs: + if not isinstance(page_kwargs[key], dict): + break + page_kwargs[key] = dict([(subkey, page_kwargs[key][subkey]) + for subkey in page_kwargs[key]]) + + return page_kwargs From 1c708a70ba7f806ca2cf373b1f3c47d74c52c086 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 11 Jun 2012 09:00:36 -0400 Subject: [PATCH 5/6] Revert "better html parser" This reverts commit c6d1de14f3db63705b35fbe3fd5db3701ea962a4. --- pelican/readers.py | 101 +-------------------------------------------- 1 file changed, 2 insertions(+), 99 deletions(-) diff --git a/pelican/readers.py b/pelican/readers.py index 83cb7e3b..83565918 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -13,11 +13,8 @@ try: from markdown import Markdown except ImportError: Markdown = False # NOQA -import cgi -from HTMLParser import HTMLParser import re - from pelican.contents import Category, Tag, Author from pelican.utils import get_date, open @@ -129,12 +126,13 @@ class MarkdownReader(Reader): metadata[name] = self.process_metadata(name, value[0]) return content, metadata -""" + class HtmlReader(Reader): file_extensions = ['html', 'htm'] _re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>') def read(self, filename): + """Parse content and metadata of (x)HTML files""" with open(filename) as content: metadata = {'title': 'unnamed'} for i in self._re.findall(content): @@ -144,101 +142,6 @@ class HtmlReader(Reader): metadata[name] = self.process_metadata(name, value) return content, metadata -""" - -class PelicanHTMLParser(HTMLParser): - def __init__(self, settings): - HTMLParser.__init__(self) - self.body = '' - self.metadata = {} - self.settings = settings - - self._data_buffer = '' - - self._in_top_level = True - self._in_head = False - self._in_title = False - self._in_body = False - self._in_tags = False - - def handle_starttag(self, tag, attrs): - if tag == 'head' and self._in_top_level: - self._in_top_level = False - self._in_head = True - elif tag == 'title' and self._in_head: - self._in_title = True - self._data_buffer = '' - elif tag == 'body' and self._in_top_level: - self._in_top_level = False - self._in_body = True - self._data_buffer = '' - elif tag == 'meta' and self._in_head: - self._handle_meta_tag(attrs) - - elif self._in_body: - self._data_buffer += self.build_tag(tag, attrs, False) - - def handle_endtag(self, tag): - if tag == 'head': - if self._in_head: - self._in_head = False - self._in_top_level = True - elif tag == 'title': - self._in_title = False - self.metadata['title'] = self._data_buffer - elif tag == 'body': - self.body = self._data_buffer - self._in_body = False - self._in_top_level = True - elif self._in_body: - self._data_buffer += ''.format(cgi.escape(tag)) - - def handle_startendtag(self, tag, attrs): - if tag == 'meta' and self._in_head: - self._handle_meta_tag(attrs) - if self._in_body: - self._data_buffer += self.build_tag(tag, attrs, True) - - def handle_comment(self, data): - if self._in_body and data.strip() == 'PELICAN_END_SUMMARY': - self.metadata['summary'] = self._data_buffer - - def handle_data(self, data): - self._data_buffer += data - - def build_tag(self, tag, attrs, close_tag): - result = '<{}'.format(cgi.escape(tag)) - result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs)) - if close_tag: - return result + ' />' - return result + '>' - - def _handle_meta_tag(self, attrs): - name = self._attr_value(attrs, 'name') - contents = self._attr_value(attrs, 'contents', '') - if name == 'keywords': - if contents: - self.metadata['tags'] = [Tag(unicode(tag), self.settings) for tag in contents.split(',')] - elif name == 'date': - self.metadata['date'] = get_date(contents) - else: - self.metadata[name] = contents - - @classmethod - def _attr_value(cls, attrs, name, default=None): - return next((x[1] for x in attrs if x[0] == name), default) - -class HTMLReader(Reader): - file_extensions = ['htm', 'html'] - enabled = True - - def read(self, filename): - """Parse content and metadata of markdown files""" - with open(filename) as content: - parser = PelicanHTMLParser(self.settings) - parser.feed(content) - parser.close() - return parser.body, parser.metadata _EXTENSIONS = {} From d9dba3864486dd3949e69976619bc993aaee387a Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 11 Jun 2012 09:00:57 -0400 Subject: [PATCH 6/6] Revert "turn utils.open into actual context manager so as to better handle encoding warnings" This reverts commit 876c7f509392d6c245a81e26fd5fd2a81851281f. --- pelican/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pelican/utils.py b/pelican/utils.py index db15a343..d4e34842 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -import contextlib import os import re import pytz @@ -33,10 +32,10 @@ def get_date(string): pass raise ValueError("'%s' is not a valid date" % string) -@contextlib.contextmanager + def open(filename): """Open a file and return it's content""" - yield _open(filename, encoding='utf-8').read() + return _open(filename, encoding='utf-8').read() def slugify(value):