From ba8ed9fb1806e5f45f30237a13f472034f6b6697 Mon Sep 17 00:00:00 2001 From: sam Date: Sun, 13 May 2012 23:37:33 +0200 Subject: [PATCH] Added strip raw option to wordpress xml importer --- pelican/tools/pelican_import.py | 27 +- tests/content/wordpressexport.xml | 578 ++++++++++++++++++++++++++++++ tests/support.py | 88 ++++- tests/test_importer.py | 43 +++ 4 files changed, 728 insertions(+), 8 deletions(-) create mode 100644 tests/content/wordpressexport.xml create mode 100644 tests/test_importer.py diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 050b1010..f3e9bb48 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -25,8 +25,14 @@ def wp2fields(xml): items = soup.rss.channel.findAll('item') for item in items: + if item.fetch('wp:status')[0].contents[0] == "publish": - title = item.title.contents[0] + + try: + title = item.title.contents[0] + except IndexError: + continue + content = item.fetch('content:encoded')[0].contents[0] filename = item.fetch('wp:post_name')[0].contents[0] @@ -197,7 +203,7 @@ def build_markdown_header(title, date, author, categories, tags): header += '\n' return header -def fields2pelican(fields, out_markup, output_path, dircat=False): +def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=False): for title, content, filename, date, author, categories, tags, in_markup in fields: if (in_markup == "markdown") or (out_markup == "markdown") : ext = '.md' @@ -230,10 +236,13 @@ def fields2pelican(fields, out_markup, output_path, dircat=False): paragraphs = [u'

{}

'.format(p) for p in paragraphs] new_content = ''.join(paragraphs) - fp.write(content) + fp.write(new_content) - cmd = 'pandoc --normalize --reference-links --from=html --to={0} -o "{1}" "{2}"'.format( - out_markup, out_filename, html_filename) + + parse_raw = '--parse-raw' if not strip_raw else '' + cmd = ('pandoc --normalize --reference-links {0} --from=html' + ' --to={1} -o "{2}" "{3}"').format( + parse_raw, out_markup, out_filename, html_filename) try: rc = subprocess.call(cmd, shell=True) @@ -279,6 +288,10 @@ def main(): help='Output markup format (supports rst & markdown)') parser.add_argument('--dir-cat', action='store_true', dest='dircat', help='Put files in directories with categories name') + parser.add_argument('--strip-raw', action='store_true', dest='strip_raw', + help="Strip raw HTML code that can't be converted to " + "markup such as flash embeds or iframes (wordpress import only)") + args = parser.parse_args() input_type = None @@ -306,4 +319,6 @@ def main(): elif input_type == 'feed': fields = feed2fields(args.input) - fields2pelican(fields, args.markup, args.output, dircat=args.dircat or False) + fields2pelican(fields, args.markup, args.output, + dircat=args.dircat or False, + strip_raw=args.strip_raw or False) diff --git a/tests/content/wordpressexport.xml b/tests/content/wordpressexport.xml new file mode 100644 index 00000000..d3e86cba --- /dev/null +++ b/tests/content/wordpressexport.xml @@ -0,0 +1,578 @@ + + + + + + + + + + + + + + + + + + + + + + + Pelican test channel + http://thisisa.test + Not a real feed, just for test + Sun, 13 May 2012 01:13:52 +0000 + en + 1.1 + http://thisisa.test + http://thisisa.test + + 2Bobbob@thisisa.test + 3Jonhjonh@thisisa.test + + 7categ-1 + 11categ-2 + 1uncategorized + 15categ-3 + 25tag-1 + 122tag2 + 68tag-3 + + http://wordpress.org/?v=3.3.1 + + + Empty post + http://thisisa.test/?attachment_id=24 + Sat, 04 Feb 2012 03:17:33 +0000 + bob + https://upload.wikimedia.org/wikipedia/commons/thumb/2/2c/Pelican_lakes_entrance02.jpg/240px-Pelican_lakes_entrance02.jpg + + + + 24 + 2012-02-04 03:17:33 + 2012-02-04 03:17:33 + open + open + empty-post + inherit + 0 + 0 + attachment + + 0 + https://upload.wikimedia.org/wikipedia/commons/thumb/2/2c/Pelican_lakes_entrance02.jpg/240px-Pelican_lakes_entrance02.jpg + + _wp_attachment_metadata + + + + _wp_attached_file + + + + _wp_attachment_image_alt + + + + + + http://thisisa.test/?p=168 + Thu, 01 Jan 1970 00:00:00 +0000 + bob + http://thisisa.test/?p=168 + + + + 168 + 2012-02-15 21:23:57 + 0000-00-00 00:00:00 + open + open + + draft + 0 + 0 + post + + 0 + + + _edit_last + + + + + A normal post + http://thisisa.test/?p=173 + Thu, 01 Jan 1970 00:00:00 +0000 + bob + http://thisisa.test/?p=173 + + +
  • Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
  • +
  • Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
  • + + +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]>
    + + 173 + 2012-02-16 15:52:55 + 0000-00-00 00:00:00 + open + open + + draft + 0 + 0 + post + + 0 + + + _edit_last + + +
    + + Complete draft + http://thisisa.test/?p=176 + Thu, 01 Jan 1970 00:00:00 +0000 + bob + http://thisisa.test/?p=176 + + + + 176 + 2012-02-17 15:11:55 + 0000-00-00 00:00:00 + open + open + + draft + 0 + 0 + post + + 0 + + + _edit_last + + + + + Page + http://thisisa.test/contact/ + Wed, 11 Apr 2012 11:38:08 +0000 + bob + http://thisisa.test/?page_id=334 + + + + 334 + 2012-04-11 06:38:08 + 2012-04-11 11:38:08 + open + open + contact + publish + 0 + 0 + page + + 0 + + sharing_disabled + + + + _wp_page_template + + + + _edit_last + + + + + Empty Page + http://thisisa.test/empty/ + Wed, 11 Apr 2012 11:38:08 +0000 + bob + http://thisisa.test/?page_id=334 + + + + 334 + 2012-04-11 06:38:08 + 2012-04-11 11:38:08 + open + open + empty + publish + 0 + 0 + page + + 0 + + sharing_disabled + + + + _wp_page_template + + + + _edit_last + + + + + Special chars: l'é + http://thisisa.test/?p=471 + Thu, 01 Jan 1970 00:00:00 +0000 + bob + http://thisisa.test/?p=471 + + + + 471 + 2012-04-29 09:44:27 + 0000-00-00 00:00:00 + open + open + + draft + 0 + 0 + post + + 0 + + + _edit_last + + + + + + With excerpt + http://thisisa.test/with-excerpt/ + Sat, 04 Feb 2012 02:03:06 +0000 + bob + http://thisisa.test/?p=8 + + + + 8 + 2012-02-04 02:03:06 + 2012-02-04 02:03:06 + open + open + with-excerpt + publish + 0 + 0 + post + + 0 + + + + + _edit_last + + + + et_bigpost + + + + _thumbnail_id + + + + + With tags + http://thisisa.test/tags/ + Sat, 04 Feb 2012 21:05:25 +0000 + bob + http://thisisa.test/?p=25 + + + + 25 + 2012-02-04 21:05:25 + 2012-02-04 21:05:25 + open + open + with-tags + publish + 0 + 0 + post + + 0 + + + + + + _edit_last + + + + et_bigpost + + + + _thumbnail_id + + + + + With comments + http://thisisa.test/with-comments/ + Wed, 18 Apr 2012 08:36:26 +0000 + john + http://thisisa.test/?p=422 + + + + 422 + 2012-04-18 03:36:26 + 2012-04-18 08:36:26 + open + open + with-comments + publish + 0 + 0 + post + + 0 + + + _edit_last + + + + _thumbnail_id + + + + 116 + + User2@mail.test + + 127.0.0.1 + 2012-05-06 15:46:06 + 2012-05-06 20:46:06 + + 1 + + 0 + 0 + + akismet_result + + + + akismet_history + + + + akismet_as_submitted + + + + + 117 + + bob@thisisa.test + + 127.0.0.1 + 2012-05-06 17:44:06 + 2012-05-06 22:44:06 + + 1 + + 116 + 3 + + akismet_result + + + + akismet_history + + + + akismet_as_submitted + + + + + 156 + + + http://thisisa.test/to-article-you-ping-back/ + 127.0.0.1 + 2012-05-09 19:30:19 + 2012-05-10 00:30:19 + + trash + pingback + 0 + 0 + + akismet_history + + + + _wp_trash_meta_status + + + + _wp_trash_meta_time + + + + + 122 + + bob@thisisa.test + + 127.0.0.1 + 2012-05-07 14:11:34 + 2012-05-07 19:11:34 + + 1 + + 121 + 3 + + akismet_result + + + + akismet_history + + + + akismet_as_submitted + + + + + + Post with raw data + http://thisisa.test/?p=173 + Thu, 01 Jan 1970 00:00:00 +0000 + bob + http://thisisa.test/?p=173 + + Pelicans are scary + +Pelicans are supposed to eat fish, damn it! + + + +Bottom line: don't mess up with birds]]> + + 173 + 2012-02-16 15:52:55 + 0000-00-00 00:00:00 + open + open + post-with-raw-data + publish + 0 + 0 + post + + 0 + + + _edit_last + + + +
    +
    diff --git a/tests/support.py b/tests/support.py index f2b4a075..994cd509 100644 --- a/tests/support.py +++ b/tests/support.py @@ -6,6 +6,11 @@ __all__ = [ import os import subprocess +import re +import sys +import cStringIO + +from functools import wraps from contextlib import contextmanager from tempfile import mkdtemp from shutil import rmtree @@ -28,8 +33,87 @@ def temporary_folder(): # do whatever you want """ tempdir = mkdtemp() - yield tempdir - rmtree(tempdir) + try: + yield tempdir + finally: + rmtree(tempdir) + + +def isplit(s, sep=None): + """ + Behave like str.split but returns a generator instead of a list. + + >>> list(isplit('\tUse the force\n')) == '\tUse the force\n'.split() + True + >>> list(isplit('\tUse the force\n')) == ['Use', 'the', 'force'] + True + >>> list(isplit('\tUse the force\n', "e")) == '\tUse the force\n'.split("e") + True + >>> list(isplit('Use the force', "e")) == 'Use the force'.split("e") + True + >>> list(isplit('Use the force', "e")) == ['Us', ' th', ' forc', ''] + True + + """ + sep, hardsep = r'\s+' if sep is None else re.escape(sep), sep is not None + exp, pos, l = re.compile(sep), 0, len(s) + while True: + m = exp.search(s, pos) + if not m: + if pos < l or hardsep: + # ^ mimic "split()": ''.split() returns [] + yield s[pos:] + break + start = m.start() + if pos < start or hardsep: + # ^ mimic "split()": includes trailing empty string + yield s[pos:start] + pos = m.end() + + +def mute(returns_output=False): + """ + Decorate a function that prints to stdout, intercepting the output. + If "returns_output" is True, the function will return a generator + yielding the printed lines instead of the return values. + + The decorator litterally hijack sys.stdout during each function + execution, so be careful with what you apply it to. + + >>> def numbers(): + print "42" + print "1984" + ... + >>> numbers() + 42 + 1984 + >>> mute()(numbers)() + >>> list(mute(True)(numbers)()) + ['42', '1984'] + + """ + + def decorator(func): + + @wraps(func) + def wrapper(*args, **kwargs): + + saved_stdout = sys.stdout + sys.stdout = cStringIO.StringIO() + + try: + out = func(*args, **kwargs) + if returns_output: + out = isplit(sys.stdout.getvalue().strip()) + finally: + sys.stdout = saved_stdout + + return out + + return wrapper + + return decorator + def get_article(title, slug, content, lang, extra_metadata=None): diff --git a/tests/test_importer.py b/tests/test_importer.py new file mode 100644 index 00000000..c6ee9cfb --- /dev/null +++ b/tests/test_importer.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- + +import os + +from pelican.tools.pelican_import import wp2fields, fields2pelican +from .support import unittest, temporary_folder, mute + +CUR_DIR = os.path.dirname(__file__) +WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml') + + +class TestWordpressXmlImporter(unittest.TestCase): + + + def setUp(self): + self.posts = wp2fields(WORDPRESS_XML_SAMPLE) + + + def test_ignore_empty_posts(self): + + posts = list(self.posts) + self.assertTrue(posts) + for title, content, fname, date, author, categ, tags, format in posts: + self.assertTrue(title.strip()) + + + def test_can_toggle_raw_html_code_parsing(self): + + posts = list(self.posts) + r = lambda f: open(f).read() + silent_f2p = mute(True)(fields2pelican) + + with temporary_folder() as temp: + + rst_files = (r(f) for f in silent_f2p(posts, 'markdown', temp)) + self.assertTrue(any('