From cacc6db9a4cbbce979e4ebf90c93c770f6359453 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Tue, 10 May 2011 07:55:30 +0600 Subject: [PATCH 1/2] ReST metadata parsing using docutils. --- pelican/readers.py | 98 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 29 deletions(-) diff --git a/pelican/readers.py b/pelican/readers.py index 7d799f88..11cbde4e 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- try: - from docutils import core + import docutils + import docutils.core + import docutils.io + from docutils.writers.html4css1 import HTMLTranslator # import the directives to have pygments support from pelican import rstdirectives @@ -21,40 +24,78 @@ _METADATA_PROCESSORS = { 'status': unicode.strip, } +def _process_metadata(name, value): + if name in _METADATA_PROCESSORS: + return _METADATA_PROCESSORS[name](value) + return value + class Reader(object): enabled = True + +class _FieldBodyTranslator(HTMLTranslator): + + def astext(self): + return ''.join(self.body) + + def visit_field_body(self, node): + pass + + def depart_field_body(self, node): + pass + + +def render_node_to_html(document, node): + visitor = _FieldBodyTranslator(document) + node.walkabout(visitor) + return visitor.astext() + +def get_metadata(document): + """Return the dict containing document metadata""" + output = {} + for docinfo in document.traverse(docutils.nodes.docinfo): + for element in docinfo.children: + if element.tagname == 'field': # custom fields (e.g. summary) + name_elem, body_elem = element.children + name = name_elem.astext() + value = render_node_to_html(document, body_elem) + else: # standard fields (e.g. address) + name = element.tagname + value = element.astext() + + output[name] = _process_metadata(name, value) + return output + + class RstReader(Reader): - enabled = bool(core) + enabled = bool(docutils) extension = "rst" - def _parse_metadata(self, content): - """Return the dict containing metadata""" - output = {} - for m in re.compile('^:([a-z]+): (.*)\s', re.M).finditer(content): - name, value = m.group(1).lower(), m.group(2) - output[name] = _METADATA_PROCESSORS.get( - name, lambda x:x - )(value) - return output + def _parse_metadata(self, document): + return get_metadata(document) + + def _get_publisher(self, filename): + extra_params = {'initial_header_level': '2'} + pub = docutils.core.Publisher(destination_class=docutils.io.StringOutput) + pub.set_components('standalone', 'restructuredtext', 'html') + pub.process_programmatic_settings(None, extra_params, None) + pub.set_source(source_path=filename) + pub.publish() + return pub def read(self, filename): - """Parse restructured text""" - text = open(filename) - metadata = self._parse_metadata(text) - extra_params = {'input_encoding': 'unicode', - 'initial_header_level': '2'} - rendered_content = core.publish_parts(text, - source_path=filename, - writer_name='html', - settings_overrides=extra_params) - title = rendered_content.get('title') - content = rendered_content.get('body') - if not metadata.has_key('title'): - metadata['title'] = title + """Parses restructured text""" + pub = self._get_publisher(filename) + parts = pub.writer.parts + content = parts.get('body') + + metadata = self._parse_metadata(pub.document) + metadata.setdefault('title', parts.get('title')) + return content, metadata + class MarkdownReader(Reader): enabled = bool(Markdown) extension = "md" @@ -64,13 +105,11 @@ class MarkdownReader(Reader): text = open(filename) md = Markdown(extensions = ['meta', 'codehilite']) content = md.convert(text) - + metadata = {} for name, value in md.Meta.items(): name = name.lower() - metadata[name] = _METADATA_PROCESSORS.get( - name, lambda x:x - )(value[0]) + metadata[name] = _process_metadata(name, value[0]) return content, metadata @@ -85,7 +124,8 @@ class HtmlReader(Reader): for i in self._re.findall(content): key = i.split(':')[0][5:].strip() value = i.split(':')[-1][:-3].strip() - metadata[key.lower()] = value + name = key.lower() + metadata[name] = _process_metadata(name, value) return content, metadata From 6cd425e408bf85cd12ba36c536bc3ff8d78c4fc7 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Wed, 11 May 2011 09:42:37 +0600 Subject: [PATCH 2/2] Basic test for the new rst reader. Locale is also converted to fr_FR.utf-8 (I wasn't able to run tests without this) --- pelican/tests/test_readers.py | 27 +++++++++++++++++++++++++++ samples/content/super_article.rst | 4 +++- samples/pelican.conf.py | 6 +++--- 3 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 pelican/tests/test_readers.py diff --git a/pelican/tests/test_readers.py b/pelican/tests/test_readers.py new file mode 100644 index 00000000..5a255209 --- /dev/null +++ b/pelican/tests/test_readers.py @@ -0,0 +1,27 @@ +# coding: utf-8 +import unittest2 +import os +import datetime +from pelican import readers + +CUR_DIR = os.path.dirname(__file__) +CONTENT_PATH = os.path.join(CUR_DIR, '..', '..', 'samples', 'content') + +def _filename(*args): + return os.path.join(CONTENT_PATH, *args) + + +class RstReaderTest(unittest2.TestCase): + + def test_metadata(self): + reader = readers.RstReader() + content, metadata = reader.read(_filename('super_article.rst')) + expected = { + 'category': 'yeah', + 'author': u'Alexis Métaireau', + 'title': 'This is a super article !', + 'summary': 'Multi-line metadata should be supported\nas well as inline markup.', + 'date': datetime.datetime(2010, 12, 2, 10, 14), + 'tags': ['foo', 'bar', 'foobar'], + } + self.assertDictEqual(metadata, expected) diff --git a/samples/content/super_article.rst b/samples/content/super_article.rst index b3e22051..03273fad 100644 --- a/samples/content/super_article.rst +++ b/samples/content/super_article.rst @@ -5,7 +5,9 @@ This is a super article ! :date: 2010-12-02 10:14 :category: yeah :author: Alexis Métaireau -:summary: This is a simple test +:summary: + Multi-line metadata should be supported + as well as **inline markup**. Some content here ! diff --git a/samples/pelican.conf.py b/samples/pelican.conf.py index a3a07fad..8648e7cb 100755 --- a/samples/pelican.conf.py +++ b/samples/pelican.conf.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- AUTHOR = u'Alexis Métaireau' -SITENAME = u"Alexis' log" +SITENAME = u"Alexis' log" SITEURL = 'http://blog.notmyidea.org' GITHUB_URL = 'http://github.com/ametaireau/' DISQUS_SITENAME = "blog-notmyidea" PDF_GENERATOR = False REVERSE_CATEGORY_ORDER = True -LOCALE = 'fr_FR.utf8' +LOCALE = 'fr_FR.utf-8' DEFAULT_PAGINATION = 2 FEED_RSS = 'feeds/all.rss.xml' @@ -33,6 +33,6 @@ STATIC_PATHS = ["pictures",] # A list of files to copy from the source to the destination FILES_TO_COPY = (('extra/robots.txt', 'robots.txt'),) -# foobar will not be used, because it's not in caps. All configuration keys +# foobar will not be used, because it's not in caps. All configuration keys # have to be in caps foobar = "barbaz"