From cc1988fbda5f191768b9d20ef0f942b572d0bb39 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Thu, 14 Jun 2012 23:08:34 -0400 Subject: [PATCH 01/19] new HTMLReader --- pelican/readers.py | 186 ++++++++---------- tests/content/article_with_keywords.html | 6 + tests/content/article_with_metadata.html | 15 ++ .../article_with_uppercase_metadata.html | 6 + tests/test_readers.py | 38 ++++ 5 files changed, 150 insertions(+), 101 deletions(-) create mode 100644 tests/content/article_with_keywords.html create mode 100644 tests/content/article_with_metadata.html create mode 100644 tests/content/article_with_uppercase_metadata.html diff --git a/pelican/readers.py b/pelican/readers.py index 83cb7e3b..9ce3e3c0 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -129,117 +129,101 @@ class MarkdownReader(Reader): metadata[name] = self.process_metadata(name, value[0]) return content, metadata -""" -class HtmlReader(Reader): - file_extensions = ['html', 'htm'] - _re = re.compile('\<\!\-\-\#\s?[A-z0-9_-]*\s?\:s?[A-z0-9\s_-]*\s?\-\-\>') - - def read(self, filename): - with open(filename) as content: - metadata = {'title': 'unnamed'} - for i in self._re.findall(content): - key = i.split(':')[0][5:].strip() - value = i.split(':')[-1][:-3].strip() - name = key.lower() - metadata[name] = self.process_metadata(name, value) - - return content, metadata -""" - -class PelicanHTMLParser(HTMLParser): - def __init__(self, settings): - HTMLParser.__init__(self) - self.body = '' - self.metadata = {} - self.settings = settings - - self._data_buffer = '' - - self._in_top_level = True - self._in_head = False - self._in_title = False - self._in_body = False - self._in_tags = False - - def handle_starttag(self, tag, attrs): - if tag == 'head' and self._in_top_level: - self._in_top_level = False - self._in_head = True - elif tag == 'title' and self._in_head: - self._in_title = True - self._data_buffer = '' - elif tag == 'body' and self._in_top_level: - self._in_top_level = False - self._in_body = True - self._data_buffer = '' - elif tag == 'meta' and self._in_head: - self._handle_meta_tag(attrs) - - elif self._in_body: - self._data_buffer += self.build_tag(tag, attrs, False) - - def handle_endtag(self, tag): - if tag == 'head': - if self._in_head: - self._in_head = False - self._in_top_level = True - elif tag == 'title': - self._in_title = False - self.metadata['title'] = self._data_buffer - elif tag == 'body': - self.body = self._data_buffer - self._in_body = False - self._in_top_level = True - elif self._in_body: - self._data_buffer += ''.format(cgi.escape(tag)) - - def handle_startendtag(self, tag, attrs): - if tag == 'meta' and self._in_head: - self._handle_meta_tag(attrs) - if self._in_body: - self._data_buffer += self.build_tag(tag, attrs, True) - - def handle_comment(self, data): - if self._in_body and data.strip() == 'PELICAN_END_SUMMARY': - self.metadata['summary'] = self._data_buffer - - def handle_data(self, data): - self._data_buffer += data - - def build_tag(self, tag, attrs, close_tag): - result = '<{}'.format(cgi.escape(tag)) - result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs)) - if close_tag: - return result + ' />' - return result + '>' - - def _handle_meta_tag(self, attrs): - name = self._attr_value(attrs, 'name') - contents = self._attr_value(attrs, 'contents', '') - if name == 'keywords': - if contents: - self.metadata['tags'] = [Tag(unicode(tag), self.settings) for tag in contents.split(',')] - elif name == 'date': - self.metadata['date'] = get_date(contents) - else: - self.metadata[name] = contents - - @classmethod - def _attr_value(cls, attrs, name, default=None): - return next((x[1] for x in attrs if x[0] == name), default) - class HTMLReader(Reader): + """Parses HTML files as input, looking for meta, title, and body tags""" file_extensions = ['htm', 'html'] enabled = True + class _HTMLParser(HTMLParser): + def __init__(self, settings): + HTMLParser.__init__(self) + self.body = '' + self.metadata = {} + self.settings = settings + + self._data_buffer = '' + + self._in_top_level = True + self._in_head = False + self._in_title = False + self._in_body = False + self._in_tags = False + + def handle_starttag(self, tag, attrs): + if tag == 'head' and self._in_top_level: + self._in_top_level = False + self._in_head = True + elif tag == 'title' and self._in_head: + self._in_title = True + self._data_buffer = '' + elif tag == 'body' and self._in_top_level: + self._in_top_level = False + self._in_body = True + self._data_buffer = '' + elif tag == 'meta' and self._in_head: + self._handle_meta_tag(attrs) + + elif self._in_body: + self._data_buffer += self.build_tag(tag, attrs, False) + + def handle_endtag(self, tag): + if tag == 'head': + if self._in_head: + self._in_head = False + self._in_top_level = True + elif tag == 'title': + self._in_title = False + self.metadata['title'] = self._data_buffer + elif tag == 'body': + self.body = self._data_buffer + self._in_body = False + self._in_top_level = True + elif self._in_body: + self._data_buffer += ''.format(cgi.escape(tag)) + + def handle_startendtag(self, tag, attrs): + if tag == 'meta' and self._in_head: + self._handle_meta_tag(attrs) + if self._in_body: + self._data_buffer += self.build_tag(tag, attrs, True) + + def handle_comment(self, data): + if self._in_body and data.strip() == 'PELICAN_END_SUMMARY': + self.metadata['summary'] = self._data_buffer + + def handle_data(self, data): + self._data_buffer += data + + def build_tag(self, tag, attrs, close_tag): + result = '<{}'.format(cgi.escape(tag)) + result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs)) + if close_tag: + return result + ' />' + return result + '>' + + def _handle_meta_tag(self, attrs): + name = self._attr_value(attrs, 'name').lower() + contents = self._attr_value(attrs, 'contents', '') + + if name == 'keywords': + name = 'tags' + self.metadata[name] = contents + + @classmethod + def _attr_value(cls, attrs, name, default=None): + return next((x[1] for x in attrs if x[0] == name), default) + def read(self, filename): """Parse content and metadata of markdown files""" with open(filename) as content: - parser = PelicanHTMLParser(self.settings) + parser = self._HTMLParser(self.settings) parser.feed(content) parser.close() - return parser.body, parser.metadata + metadata = {} + for k in parser.metadata: + metadata[k] = self.process_metadata(k, parser.metadata[k]) + return parser.body, metadata _EXTENSIONS = {} diff --git a/tests/content/article_with_keywords.html b/tests/content/article_with_keywords.html new file mode 100644 index 00000000..c869f514 --- /dev/null +++ b/tests/content/article_with_keywords.html @@ -0,0 +1,6 @@ + + + This is a super article ! + + + diff --git a/tests/content/article_with_metadata.html b/tests/content/article_with_metadata.html new file mode 100644 index 00000000..2bd77241 --- /dev/null +++ b/tests/content/article_with_metadata.html @@ -0,0 +1,15 @@ + + + This is a super article ! + + + + + + + + Multi-line metadata should be supported + as well as inline markup. + + + diff --git a/tests/content/article_with_uppercase_metadata.html b/tests/content/article_with_uppercase_metadata.html new file mode 100644 index 00000000..4fe5a9ee --- /dev/null +++ b/tests/content/article_with_uppercase_metadata.html @@ -0,0 +1,6 @@ + + + This is a super article ! + + + diff --git a/tests/test_readers.py b/tests/test_readers.py index a921cfc2..52887068 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -86,3 +86,41 @@ class MdReaderTest(unittest.TestCase): "

This is another markdown test file. Uses the mkd extension.

" self.assertEqual(content, expected) + +class HTMLReaderTest(unittest.TestCase): + + def test_article_with_metadata(self): + reader = readers.HTMLReader({}) + content, metadata = reader.read(_filename('article_with_metadata.html')) + expected = { + 'category': 'yeah', + 'author': u'Alexis Métaireau', + 'title': 'This is a super article !', + 'summary': u''' + Multi-line metadata should be supported + as well as inline markup. + ''', + 'date': datetime.datetime(2010, 12, 2, 10, 14), + 'tags': ['foo', 'bar', 'foobar'], + 'custom_field': 'http://notmyidea.org', + } + + for key, value in expected.items(): + self.assertEquals(value, metadata[key], key) + + def test_article_with_keywords(self): + reader = readers.HTMLReader({}) + content, metadata = reader.read(_filename('article_with_keywords.html')) + expected = { + 'tags': ['foo', 'bar', 'foobar'], + } + + for key, value in expected.items(): + self.assertEquals(value, metadata[key], key) + + def test_article_metadata_key_lowercase(self): + """Keys of metadata should be lowercase.""" + reader = readers.HTMLReader({}) + content, metadata = reader.read(_filename('article_with_uppercase_metadata.html')) + self.assertIn('category', metadata, "Key should be lowercase.") + self.assertEquals('Yeah', metadata.get('category'), "Value keeps cases.") From 0373c15e430e168928b645be3b9513f093b97403 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Thu, 14 Jun 2012 23:16:27 -0400 Subject: [PATCH 02/19] include html comments properly in reader --- pelican/readers.py | 2 ++ tests/content/article_with_comments.html | 7 +++++ tests/test_readers.py | 36 ++++++++++++++++++------ 3 files changed, 36 insertions(+), 9 deletions(-) create mode 100644 tests/content/article_with_comments.html diff --git a/pelican/readers.py b/pelican/readers.py index 9ce3e3c0..e3d0e0dd 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -190,6 +190,8 @@ class HTMLReader(Reader): def handle_comment(self, data): if self._in_body and data.strip() == 'PELICAN_END_SUMMARY': self.metadata['summary'] = self._data_buffer + else: + self._data_buffer += ''.format(data) def handle_data(self, data): self._data_buffer += data diff --git a/tests/content/article_with_comments.html b/tests/content/article_with_comments.html new file mode 100644 index 00000000..f222682d --- /dev/null +++ b/tests/content/article_with_comments.html @@ -0,0 +1,7 @@ + + + Summary comment is not included. + + + + diff --git a/tests/test_readers.py b/tests/test_readers.py index 52887068..b3e30bfc 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -88,6 +88,33 @@ class MdReaderTest(unittest.TestCase): self.assertEqual(content, expected) class HTMLReaderTest(unittest.TestCase): + def test_article_with_comments(self): + reader = readers.HTMLReader({}) + content, metadata = reader.read(_filename('article_with_comments.html')) + expected = { + 'summary': ''' + Summary comment is not included. + ''', + } + + for key, value in expected.items(): + self.assertEquals(value, metadata[key], key) + + self.assertEquals(''' + Summary comment is not included. + + + ''', content) + + def test_article_with_keywords(self): + reader = readers.HTMLReader({}) + content, metadata = reader.read(_filename('article_with_keywords.html')) + expected = { + 'tags': ['foo', 'bar', 'foobar'], + } + + for key, value in expected.items(): + self.assertEquals(value, metadata[key], key) def test_article_with_metadata(self): reader = readers.HTMLReader({}) @@ -108,15 +135,6 @@ class HTMLReaderTest(unittest.TestCase): for key, value in expected.items(): self.assertEquals(value, metadata[key], key) - def test_article_with_keywords(self): - reader = readers.HTMLReader({}) - content, metadata = reader.read(_filename('article_with_keywords.html')) - expected = { - 'tags': ['foo', 'bar', 'foobar'], - } - - for key, value in expected.items(): - self.assertEquals(value, metadata[key], key) def test_article_metadata_key_lowercase(self): """Keys of metadata should be lowercase.""" From c608d39aa40b8304f4e2e241564796201e582da4 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Wed, 20 Jun 2012 19:52:17 -0400 Subject: [PATCH 03/19] re-import htmlparser --- pelican/readers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pelican/readers.py b/pelican/readers.py index 870c11c8..1916fa1e 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -15,6 +15,8 @@ except ImportError: Markdown = False # NOQA import re +from htmlparser import HTMLParser + from pelican.contents import Category, Tag, Author from pelican.utils import get_date, open From caa4442abb145d419a3120c7339ad7ecf91ac56c Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Wed, 20 Jun 2012 19:59:32 -0400 Subject: [PATCH 04/19] re-import cgi. properly turn utils.open into a context manager --- pelican/readers.py | 3 ++- pelican/utils.py | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pelican/readers.py b/pelican/readers.py index 1916fa1e..d05ab40f 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -15,7 +15,8 @@ except ImportError: Markdown = False # NOQA import re -from htmlparser import HTMLParser +import cgi +from HTMLParser import HTMLParser from pelican.contents import Category, Tag, Author from pelican.utils import get_date, open diff --git a/pelican/utils.py b/pelican/utils.py index 0940bf72..088a8faa 100644 --- a/pelican/utils.py +++ b/pelican/utils.py @@ -34,10 +34,15 @@ def get_date(string): raise ValueError("'%s' is not a valid date" % string) -def open(filename): +class open(object): """Open a file and return it's content""" - return _open(filename, encoding='utf-8').read() + def __init__(self, filename): + self.filename = filename + def __enter__(self): + return _open(self.filename, encoding='utf-8').read() + def __exit__(self, exc_type, exc_value, traceback): + pass def slugify(value): """ From 56800a1d43ff9e07659d0f5ad570a9004d44cd74 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Wed, 20 Jun 2012 20:02:41 -0400 Subject: [PATCH 05/19] fix failing test with new open context manager --- pelican/readers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pelican/readers.py b/pelican/readers.py index d05ab40f..1d06bd6d 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -119,9 +119,9 @@ class MarkdownReader(Reader): def read(self, filename): """Parse content and metadata of markdown files""" - text = open(filename) - md = Markdown(extensions=set(self.extensions + ['meta'])) - content = md.convert(text) + with open(filename) as text: + md = Markdown(extensions=set(self.extensions + ['meta'])) + content = md.convert(text) metadata = {} for name, value in md.Meta.items(): From c0578eb9ab77c7be4a045f58a7844222ccbe6b95 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Wed, 20 Jun 2012 23:19:06 -0400 Subject: [PATCH 06/19] handle escaped chars in html properly --- pelican/readers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pelican/readers.py b/pelican/readers.py index 1d06bd6d..08ef4cf8 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -196,6 +196,12 @@ class HTMLReader(Reader): def handle_data(self, data): self._data_buffer += data + def handle_entityref(self, data): + self._data_buffer += '&{};'.format(data) + + def handle_charref(self, data): + self._data_buffer += '&{};'.format(data) + def build_tag(self, tag, attrs, close_tag): result = '<{}'.format(cgi.escape(tag)) result += ''.join((' {}="{}"'.format(cgi.escape(k), cgi.escape(v)) for k,v in attrs)) From 036728a194695d463123c714954c25a3d6a826d5 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Thu, 21 Jun 2012 09:05:27 -0400 Subject: [PATCH 07/19] properly write out charref's --- pelican/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pelican/readers.py b/pelican/readers.py index 08ef4cf8..93549d96 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -200,7 +200,7 @@ class HTMLReader(Reader): self._data_buffer += '&{};'.format(data) def handle_charref(self, data): - self._data_buffer += '&{};'.format(data) + self._data_buffer += '&#{};'.format(data) def build_tag(self, tag, attrs, close_tag): result = '<{}'.format(cgi.escape(tag)) From 847a6fe3cee7f05e36679d6b12fafaf58cfc1045 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Thu, 21 Jun 2012 09:12:38 -0400 Subject: [PATCH 08/19] change 'markdown' to HTML in the comments --- pelican/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pelican/readers.py b/pelican/readers.py index 93549d96..9d200599 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -222,7 +222,7 @@ class HTMLReader(Reader): return next((x[1] for x in attrs if x[0] == name), default) def read(self, filename): - """Parse content and metadata of markdown files""" + """Parse content and metadata of HTML files""" with open(filename) as content: parser = self._HTMLParser(self.settings) parser.feed(content) From a86d5fda71a2d2ce7295cb385641331b139bf361 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 9 Jul 2012 22:43:51 -0400 Subject: [PATCH 09/19] add documentation for html reader --- docs/getting_started.rst | 30 ++++++++++++++++++++++++++++++ docs/internals.rst | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 93d578a0..d60cce83 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -154,6 +154,36 @@ Markdown posts should follow this pattern:: This is the content of my super blog post. +Lastly, you can use Vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican +interprets the HTML in a very straightforward manner, reading meta data out +of ``meta`` tags, the title out of the ``title`` tag, and the body out of the +``body`` tag:: + + + + My super title + + + + + + + This is the content of my super blog post. + + Content continues down here. + + + +With HTML, there are two simple exceptions to the standard metadata. First, +``tags`` can be specified either with the ``tags`` metadata, as is standard in +Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can +be used interchangeably. The second note is that summaries are done differently +in HTML posts. Either a ``summary`` metadata tag can be supplied, or, as seen +above, you can place an HTML comment, ````, that +Pelican will recognize. Everything before the comment will be treated as a +summary. The content of the post will contain everything in the body tag, with +the special comment stripped out. + Note that, aside from the title, none of this metadata is mandatory: if the date is not specified, Pelican will rely on the file's "mtime" timestamp, and the category can be determined by the directory in which the file resides. For diff --git a/docs/internals.rst b/docs/internals.rst index 6b6f991f..a94d1c56 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -23,7 +23,7 @@ The logic is separated into different classes and concepts: on. Since those operations are commonly used, the object is created once and then passed to the generators. -* **Readers** are used to read from various formats (Markdown and +* **Readers** are used to read from various formats (HTML, Markdown and reStructuredText for now, but the system is extensible). Given a file, they return metadata (author, tags, category, etc.) and content (HTML-formatted). From 4ec6cefe1db92c0bc6cea9a95c810e3f5b455865 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 9 Jul 2012 22:45:34 -0400 Subject: [PATCH 10/19] fix grammar --- docs/getting_started.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index d60cce83..5e553815 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -154,7 +154,7 @@ Markdown posts should follow this pattern:: This is the content of my super blog post. -Lastly, you can use Vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican +Lastly, you can use vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican interprets the HTML in a very straightforward manner, reading meta data out of ``meta`` tags, the title out of the ``title`` tag, and the body out of the ``body`` tag:: From 357f3a3da211cffeda1501e1c8fb54dc069694f6 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Thu, 21 Jun 2012 09:05:27 -0400 Subject: [PATCH 11/19] properly write out charref's --- pelican/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pelican/readers.py b/pelican/readers.py index 6fe8e894..de3df66f 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -235,7 +235,7 @@ class HTMLReader(Reader): self._data_buffer += '&{};'.format(data) def handle_charref(self, data): - self._data_buffer += '&{};'.format(data) + self._data_buffer += '&#{};'.format(data) def build_tag(self, tag, attrs, close_tag): result = '<{}'.format(cgi.escape(tag)) From 5f639b9a3b79213d5fd631216888af71990723c2 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 28 Jan 2013 21:46:54 -0500 Subject: [PATCH 12/19] git rebase master --- pelican/readers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pelican/readers.py b/pelican/readers.py index de3df66f..60fabe82 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -257,8 +257,8 @@ class HTMLReader(Reader): return next((x[1] for x in attrs if x[0] == name), default) def read(self, filename): - """Parse content and metadata of markdown files""" - with pelican_open(filename) as content: + """Parse content and metadata of HTML files""" + with open(filename) as content: parser = self._HTMLParser(self.settings) parser.feed(content) parser.close() From bf6f16e3839be680296c8325922606410bb86d8a Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 9 Jul 2012 22:43:51 -0400 Subject: [PATCH 13/19] add documentation for html reader --- docs/getting_started.rst | 36 ++++++++++++++++++++++++++++++++++++ docs/internals.rst | 4 ++-- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 0952c7d9..7592a5ef 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -190,6 +190,42 @@ syntax for Markdown posts should follow this pattern:: This is the content of my super blog post. +Lastly, you can use Vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican +interprets the HTML in a very straightforward manner, reading meta data out +of ``meta`` tags, the title out of the ``title`` tag, and the body out of the +``body`` tag:: + + + + My super title + + + + + + + This is the content of my super blog post. + + Content continues down here. + + + +With HTML, there are two simple exceptions to the standard metadata. First, +``tags`` can be specified either with the ``tags`` metadata, as is standard in +Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can +be used interchangeably. The second note is that summaries are done differently +in HTML posts. Either a ``summary`` metadata tag can be supplied, or, as seen +above, you can place an HTML comment, ````, that +Pelican will recognize. Everything before the comment will be treated as a +summary. The content of the post will contain everything in the body tag, with +the special comment stripped out. + +Note that, aside from the title, none of this metadata is mandatory: if the date +is not specified, Pelican will rely on the file's "mtime" timestamp, and the +category can be determined by the directory in which the file resides. For +example, a file located at ``python/foobar/myfoobar.rst`` will have a category of +``foobar``. + Note that, aside from the title, none of this metadata is mandatory: if the date is not specified, Pelican can rely on the file's "mtime" timestamp through the ``DEFAULT_DATE`` setting, and the category can be determined by the diff --git a/docs/internals.rst b/docs/internals.rst index cadd300b..704122ba 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -23,8 +23,8 @@ The logic is separated into different classes and concepts: on. Since those operations are commonly used, the object is created once and then passed to the generators. -* **Readers** are used to read from various formats (AsciiDoc, Markdown and - reStructuredText for now, but the system is extensible). Given a file, they +* **Readers** are used to read from various formats (AsciiDoc, HTML, Markdown and + reStructuredText for now, but the system is extensible). Given a file, they return metadata (author, tags, category, etc.) and content (HTML-formatted). * **Generators** generate the different outputs. For instance, Pelican comes with From e6a4fe3fc40f003ad9ecba183f12a2fdc6a5adeb Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 9 Jul 2012 22:45:34 -0400 Subject: [PATCH 14/19] fix grammar --- docs/getting_started.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 7592a5ef..c7f2e257 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -190,7 +190,7 @@ syntax for Markdown posts should follow this pattern:: This is the content of my super blog post. -Lastly, you can use Vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican +Lastly, you can use vanilla HTML (files ending in ``.htm`` and ``.html``). Pelican interprets the HTML in a very straightforward manner, reading meta data out of ``meta`` tags, the title out of the ``title`` tag, and the body out of the ``body`` tag:: From 7b59b34a73560b3eeb6c737a5e8ce2e5b9c4c36b Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 28 Jan 2013 22:11:06 -0500 Subject: [PATCH 15/19] get tests passing --- pelican/readers.py | 10 ++++----- tests/content/article_with_comments.html | 7 ++++--- tests/content/article_with_metadata.html | 2 +- tests/test_readers.py | 26 +++++++----------------- 4 files changed, 17 insertions(+), 28 deletions(-) diff --git a/pelican/readers.py b/pelican/readers.py index 60fabe82..9b8be192 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -223,10 +223,10 @@ class HTMLReader(Reader): self._data_buffer += self.build_tag(tag, attrs, True) def handle_comment(self, data): - if self._in_body and data.strip() == 'PELICAN_END_SUMMARY': - self.metadata['summary'] = self._data_buffer - else: - self._data_buffer += ''.format(data) + # if self._in_body and data.strip() == 'PELICAN_END_SUMMARY': + # self.metadata['summary'] = self._data_buffer + # else: + self._data_buffer += ''.format(data) def handle_data(self, data): self._data_buffer += data @@ -258,7 +258,7 @@ class HTMLReader(Reader): def read(self, filename): """Parse content and metadata of HTML files""" - with open(filename) as content: + with pelican_open(filename) as content: parser = self._HTMLParser(self.settings) parser.feed(content) parser.close() diff --git a/tests/content/article_with_comments.html b/tests/content/article_with_comments.html index f222682d..289e4a66 100644 --- a/tests/content/article_with_comments.html +++ b/tests/content/article_with_comments.html @@ -1,7 +1,8 @@ + + - Summary comment is not included. - - + Body content + diff --git a/tests/content/article_with_metadata.html b/tests/content/article_with_metadata.html index 2bd77241..b108ac8a 100644 --- a/tests/content/article_with_metadata.html +++ b/tests/content/article_with_metadata.html @@ -5,11 +5,11 @@ + Multi-line metadata should be supported as well as inline markup. - diff --git a/tests/test_readers.py b/tests/test_readers.py index 8cee4c1a..49130669 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -264,25 +264,16 @@ class AdReaderTest(unittest.TestCase): class HTMLReaderTest(unittest.TestCase): def test_article_with_comments(self): reader = readers.HTMLReader({}) - content, metadata = reader.read(_filename('article_with_comments.html')) - expected = { - 'summary': ''' - Summary comment is not included. - ''', - } - - for key, value in expected.items(): - self.assertEquals(value, metadata[key], key) + content, metadata = reader.read(_path('article_with_comments.html')) self.assertEquals(''' - Summary comment is not included. - - + Body content + ''', content) def test_article_with_keywords(self): reader = readers.HTMLReader({}) - content, metadata = reader.read(_filename('article_with_keywords.html')) + content, metadata = reader.read(_path('article_with_keywords.html')) expected = { 'tags': ['foo', 'bar', 'foobar'], } @@ -292,15 +283,12 @@ class HTMLReaderTest(unittest.TestCase): def test_article_with_metadata(self): reader = readers.HTMLReader({}) - content, metadata = reader.read(_filename('article_with_metadata.html')) + content, metadata = reader.read(_path('article_with_metadata.html')) expected = { 'category': 'yeah', 'author': u'Alexis Métaireau', 'title': 'This is a super article !', - 'summary': u''' - Multi-line metadata should be supported - as well as inline markup. - ''', + 'summary': u'''Summary and stuff''', 'date': datetime.datetime(2010, 12, 2, 10, 14), 'tags': ['foo', 'bar', 'foobar'], 'custom_field': 'http://notmyidea.org', @@ -313,6 +301,6 @@ class HTMLReaderTest(unittest.TestCase): def test_article_metadata_key_lowercase(self): """Keys of metadata should be lowercase.""" reader = readers.HTMLReader({}) - content, metadata = reader.read(_filename('article_with_uppercase_metadata.html')) + content, metadata = reader.read(_path('article_with_uppercase_metadata.html')) self.assertIn('category', metadata, "Key should be lowercase.") self.assertEquals('Yeah', metadata.get('category'), "Value keeps cases.") From 2a3d7d031949e0b3ae693b8867350d3e40bb7f13 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 28 Jan 2013 22:21:45 -0500 Subject: [PATCH 16/19] fix python3 support --- pelican/readers.py | 5 ++++- tests/test_readers.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pelican/readers.py b/pelican/readers.py index 9b8be192..8667a299 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -26,7 +26,10 @@ except ImportError: import re import cgi -from HTMLParser import HTMLParser +try: + from html.parser import HTMLParser +except ImportError: + from HTMLParser import HTMLParser from pelican.contents import Category, Tag, Author from pelican.utils import get_date, pelican_open diff --git a/tests/test_readers.py b/tests/test_readers.py index 49130669..39bc2067 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -286,9 +286,9 @@ class HTMLReaderTest(unittest.TestCase): content, metadata = reader.read(_path('article_with_metadata.html')) expected = { 'category': 'yeah', - 'author': u'Alexis Métaireau', + 'author': 'Alexis Métaireau', 'title': 'This is a super article !', - 'summary': u'''Summary and stuff''', + 'summary': 'Summary and stuff', 'date': datetime.datetime(2010, 12, 2, 10, 14), 'tags': ['foo', 'bar', 'foobar'], 'custom_field': 'http://notmyidea.org', From d5bfec3a8b7ee4eef9f7359b93a8ce86cf2d701a Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Mon, 28 Jan 2013 22:25:15 -0500 Subject: [PATCH 17/19] update documentation and remove commented out code --- docs/getting_started.rst | 12 +++--------- pelican/readers.py | 3 --- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index c7f2e257..7155efce 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -202,23 +202,17 @@ of ``meta`` tags, the title out of the ``title`` tag, and the body out of the + This is the content of my super blog post. - - Content continues down here. -With HTML, there are two simple exceptions to the standard metadata. First, +With HTML, there is one simple exception to the standard metadata. ``tags`` can be specified either with the ``tags`` metadata, as is standard in Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can -be used interchangeably. The second note is that summaries are done differently -in HTML posts. Either a ``summary`` metadata tag can be supplied, or, as seen -above, you can place an HTML comment, ````, that -Pelican will recognize. Everything before the comment will be treated as a -summary. The content of the post will contain everything in the body tag, with -the special comment stripped out. +be used interchangeably. Note that, aside from the title, none of this metadata is mandatory: if the date is not specified, Pelican will rely on the file's "mtime" timestamp, and the diff --git a/pelican/readers.py b/pelican/readers.py index 8667a299..ecb49f9c 100644 --- a/pelican/readers.py +++ b/pelican/readers.py @@ -226,9 +226,6 @@ class HTMLReader(Reader): self._data_buffer += self.build_tag(tag, attrs, True) def handle_comment(self, data): - # if self._in_body and data.strip() == 'PELICAN_END_SUMMARY': - # self.metadata['summary'] = self._data_buffer - # else: self._data_buffer += ''.format(data) def handle_data(self, data): From 8ba6a4d19d5d31d7b183332d812236b4b69d8dcd Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Sat, 9 Feb 2013 09:27:45 -0500 Subject: [PATCH 18/19] fix documentation --- docs/getting_started.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 7155efce..4aeddbfb 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -215,10 +215,10 @@ Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can be used interchangeably. Note that, aside from the title, none of this metadata is mandatory: if the date -is not specified, Pelican will rely on the file's "mtime" timestamp, and the -category can be determined by the directory in which the file resides. For -example, a file located at ``python/foobar/myfoobar.rst`` will have a category of -``foobar``. +is not specified and DEFAULT_DATE is None, Pelican will rely on the file's +"mtime" timestamp, and the category can be determined by the directory in which +the file resides. For example, a file located at ``python/foobar/myfoobar.rst`` +will have a category of ``foobar``. Note that, aside from the title, none of this metadata is mandatory: if the date is not specified, Pelican can rely on the file's "mtime" timestamp through From 5f5b300ba5c2703c015ca5b0db540cb824cf2bd6 Mon Sep 17 00:00:00 2001 From: dave mankoff Date: Sat, 9 Feb 2013 09:51:02 -0500 Subject: [PATCH 19/19] fix documentation --- docs/getting_started.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 4aeddbfb..afea8c01 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -215,7 +215,7 @@ Pelican, or with the ``keywords`` metadata, as is standard in HTML. The two can be used interchangeably. Note that, aside from the title, none of this metadata is mandatory: if the date -is not specified and DEFAULT_DATE is None, Pelican will rely on the file's +is not specified and DEFAULT_DATE is 'fs', Pelican will rely on the file's "mtime" timestamp, and the category can be determined by the directory in which the file resides. For example, a file located at ``python/foobar/myfoobar.rst`` will have a category of ``foobar``.