'.format(p) for p in paragraphs]
new_content = ''.join(paragraphs)
- fp.write(content)
+ fp.write(new_content)
- cmd = 'pandoc --normalize --reference-links --from=html --to={0} -o "{1}" "{2}"'.format(
- out_markup, out_filename, html_filename)
+
+ parse_raw = '--parse-raw' if not strip_raw else ''
+ cmd = ('pandoc --normalize --reference-links {0} --from=html'
+ ' --to={1} -o "{2}" "{3}"').format(
+ parse_raw, out_markup, out_filename, html_filename)
try:
rc = subprocess.call(cmd, shell=True)
@@ -279,6 +288,10 @@ def main():
help='Output markup format (supports rst & markdown)')
parser.add_argument('--dir-cat', action='store_true', dest='dircat',
help='Put files in directories with categories name')
+ parser.add_argument('--strip-raw', action='store_true', dest='strip_raw',
+ help="Strip raw HTML code that can't be converted to "
+ "markup such as flash embeds or iframes (wordpress import only)")
+
args = parser.parse_args()
input_type = None
@@ -306,4 +319,6 @@ def main():
elif input_type == 'feed':
fields = feed2fields(args.input)
- fields2pelican(fields, args.markup, args.output, dircat=args.dircat or False)
+ fields2pelican(fields, args.markup, args.output,
+ dircat=args.dircat or False,
+ strip_raw=args.strip_raw or False)
diff --git a/tests/content/wordpressexport.xml b/tests/content/wordpressexport.xml
new file mode 100644
index 00000000..d3e86cba
--- /dev/null
+++ b/tests/content/wordpressexport.xml
@@ -0,0 +1,578 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Pelican test channel
+ http://thisisa.test
+ Not a real feed, just for test
+ Sun, 13 May 2012 01:13:52 +0000
+ en
+ 1.1
+ http://thisisa.test
+ http://thisisa.test
+
+ 2Bobbob@thisisa.test
+ 3Jonhjonh@thisisa.test
+
+ 7categ-1
+ 11categ-2
+ 1uncategorized
+ 15categ-3
+ 25tag-1
+ 122tag2
+ 68tag-3
+
+ http://wordpress.org/?v=3.3.1
+
+
+ Empty post
+ http://thisisa.test/?attachment_id=24
+ Sat, 04 Feb 2012 03:17:33 +0000
+ bob
+ https://upload.wikimedia.org/wikipedia/commons/thumb/2/2c/Pelican_lakes_entrance02.jpg/240px-Pelican_lakes_entrance02.jpg
+
+
+
+ 24
+ 2012-02-04 03:17:33
+ 2012-02-04 03:17:33
+ open
+ open
+ empty-post
+ inherit
+ 0
+ 0
+ attachment
+
+ 0
+ https://upload.wikimedia.org/wikipedia/commons/thumb/2/2c/Pelican_lakes_entrance02.jpg/240px-Pelican_lakes_entrance02.jpg
+
+ _wp_attachment_metadata
+
+
+
+ _wp_attached_file
+
+
+
+ _wp_attachment_image_alt
+
+
+
+
+
+ http://thisisa.test/?p=168
+ Thu, 01 Jan 1970 00:00:00 +0000
+ bob
+ http://thisisa.test/?p=168
+
+
+
+ 168
+ 2012-02-15 21:23:57
+ 0000-00-00 00:00:00
+ open
+ open
+
+ draft
+ 0
+ 0
+ post
+
+ 0
+
+
+ _edit_last
+
+
+
+
+ A normal post
+ http://thisisa.test/?p=173
+ Thu, 01 Jan 1970 00:00:00 +0000
+ bob
+ http://thisisa.test/?p=173
+
+
+
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
+tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
+quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
+consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
+cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
+proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
+tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
+quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
+consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
+cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
+proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+
+
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
+tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
+quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
+consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
+cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
+proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]>
+
+ 173
+ 2012-02-16 15:52:55
+ 0000-00-00 00:00:00
+ open
+ open
+
+ draft
+ 0
+ 0
+ post
+
+ 0
+
+
+ _edit_last
+
+
+
+
+ Complete draft
+ http://thisisa.test/?p=176
+ Thu, 01 Jan 1970 00:00:00 +0000
+ bob
+ http://thisisa.test/?p=176
+
+
+
+ 176
+ 2012-02-17 15:11:55
+ 0000-00-00 00:00:00
+ open
+ open
+
+ draft
+ 0
+ 0
+ post
+
+ 0
+
+
+ _edit_last
+
+
+
+
+ Page
+ http://thisisa.test/contact/
+ Wed, 11 Apr 2012 11:38:08 +0000
+ bob
+ http://thisisa.test/?page_id=334
+
+
+
+ 334
+ 2012-04-11 06:38:08
+ 2012-04-11 11:38:08
+ open
+ open
+ contact
+ publish
+ 0
+ 0
+ page
+
+ 0
+
+ sharing_disabled
+
+
+
+ _wp_page_template
+
+
+
+ _edit_last
+
+
+
+
+ Empty Page
+ http://thisisa.test/empty/
+ Wed, 11 Apr 2012 11:38:08 +0000
+ bob
+ http://thisisa.test/?page_id=334
+
+
+
+ 334
+ 2012-04-11 06:38:08
+ 2012-04-11 11:38:08
+ open
+ open
+ empty
+ publish
+ 0
+ 0
+ page
+
+ 0
+
+ sharing_disabled
+
+
+
+ _wp_page_template
+
+
+
+ _edit_last
+
+
+
+
+ Special chars: l'é
+ http://thisisa.test/?p=471
+ Thu, 01 Jan 1970 00:00:00 +0000
+ bob
+ http://thisisa.test/?p=471
+
+
+
+ 471
+ 2012-04-29 09:44:27
+ 0000-00-00 00:00:00
+ open
+ open
+
+ draft
+ 0
+ 0
+ post
+
+ 0
+
+
+ _edit_last
+
+
+
+
+
+ With excerpt
+ http://thisisa.test/with-excerpt/
+ Sat, 04 Feb 2012 02:03:06 +0000
+ bob
+ http://thisisa.test/?p=8
+
+
+
+ 8
+ 2012-02-04 02:03:06
+ 2012-02-04 02:03:06
+ open
+ open
+ with-excerpt
+ publish
+ 0
+ 0
+ post
+
+ 0
+
+
+
+
+ _edit_last
+
+
+
+ et_bigpost
+
+
+
+ _thumbnail_id
+
+
+
+
+ With tags
+ http://thisisa.test/tags/
+ Sat, 04 Feb 2012 21:05:25 +0000
+ bob
+ http://thisisa.test/?p=25
+
+
+
+ 25
+ 2012-02-04 21:05:25
+ 2012-02-04 21:05:25
+ open
+ open
+ with-tags
+ publish
+ 0
+ 0
+ post
+
+ 0
+
+
+
+
+
+ _edit_last
+
+
+
+ et_bigpost
+
+
+
+ _thumbnail_id
+
+
+
+
+ With comments
+ http://thisisa.test/with-comments/
+ Wed, 18 Apr 2012 08:36:26 +0000
+ john
+ http://thisisa.test/?p=422
+
+
+
+ 422
+ 2012-04-18 03:36:26
+ 2012-04-18 08:36:26
+ open
+ open
+ with-comments
+ publish
+ 0
+ 0
+ post
+
+ 0
+
+
+ _edit_last
+
+
+
+ _thumbnail_id
+
+
+
+ 116
+
+ User2@mail.test
+
+ 127.0.0.1
+ 2012-05-06 15:46:06
+ 2012-05-06 20:46:06
+
+ 1
+
+ 0
+ 0
+
+ akismet_result
+
+
+
+ akismet_history
+
+
+
+ akismet_as_submitted
+
+
+
+
+ 117
+
+ bob@thisisa.test
+
+ 127.0.0.1
+ 2012-05-06 17:44:06
+ 2012-05-06 22:44:06
+
+ 1
+
+ 116
+ 3
+
+ akismet_result
+
+
+
+ akismet_history
+
+
+
+ akismet_as_submitted
+
+
+
+
+ 156
+
+
+ http://thisisa.test/to-article-you-ping-back/
+ 127.0.0.1
+ 2012-05-09 19:30:19
+ 2012-05-10 00:30:19
+
+ trash
+ pingback
+ 0
+ 0
+
+ akismet_history
+
+
+
+ _wp_trash_meta_status
+
+
+
+ _wp_trash_meta_time
+
+
+
+
+ 122
+
+ bob@thisisa.test
+
+ 127.0.0.1
+ 2012-05-07 14:11:34
+ 2012-05-07 19:11:34
+
+ 1
+
+ 121
+ 3
+
+ akismet_result
+
+
+
+ akismet_history
+
+
+
+ akismet_as_submitted
+
+
+
+
+
+ Post with raw data
+ http://thisisa.test/?p=173
+ Thu, 01 Jan 1970 00:00:00 +0000
+ bob
+ http://thisisa.test/?p=173
+
+ Pelicans are scary
+
+Pelicans are supposed to eat fish, damn it!
+
+
+
+Bottom line: don't mess up with birds]]>
+
+ 173
+ 2012-02-16 15:52:55
+ 0000-00-00 00:00:00
+ open
+ open
+ post-with-raw-data
+ publish
+ 0
+ 0
+ post
+
+ 0
+
+
+ _edit_last
+
+
+
+
+
diff --git a/tests/support.py b/tests/support.py
index f2b4a075..994cd509 100644
--- a/tests/support.py
+++ b/tests/support.py
@@ -6,6 +6,11 @@ __all__ = [
import os
import subprocess
+import re
+import sys
+import cStringIO
+
+from functools import wraps
from contextlib import contextmanager
from tempfile import mkdtemp
from shutil import rmtree
@@ -28,8 +33,87 @@ def temporary_folder():
# do whatever you want
"""
tempdir = mkdtemp()
- yield tempdir
- rmtree(tempdir)
+ try:
+ yield tempdir
+ finally:
+ rmtree(tempdir)
+
+
+def isplit(s, sep=None):
+ """
+ Behaves like str.split but returns a generator instead of a list.
+
+ >>> list(isplit('\tUse the force\n')) == '\tUse the force\n'.split()
+ True
+ >>> list(isplit('\tUse the force\n')) == ['Use', 'the', 'force']
+ True
+ >>> list(isplit('\tUse the force\n', "e")) == '\tUse the force\n'.split("e")
+ True
+ >>> list(isplit('Use the force', "e")) == 'Use the force'.split("e")
+ True
+ >>> list(isplit('Use the force', "e")) == ['Us', ' th', ' forc', '']
+ True
+
+ """
+ sep, hardsep = r'\s+' if sep is None else re.escape(sep), sep is not None
+ exp, pos, l = re.compile(sep), 0, len(s)
+ while True:
+ m = exp.search(s, pos)
+ if not m:
+ if pos < l or hardsep:
+ # ^ mimic "split()": ''.split() returns []
+ yield s[pos:]
+ break
+ start = m.start()
+ if pos < start or hardsep:
+ # ^ mimic "split()": includes trailing empty string
+ yield s[pos:start]
+ pos = m.end()
+
+
+def mute(returns_output=False):
+ """
+ Decorate a function that prints to stdout, intercepting the output.
+ If "returns_output" is True, the function will return a generator
+ yielding the whitespace-separated tokens of the printed output instead of the return value.
+
+ The decorator literally hijacks sys.stdout during each function
+ execution, so be careful with what you apply it to.
+
+ >>> def numbers():
+ ...     print "42"
+ ...     print "1984"
+ ...
+ >>> numbers()
+ 42
+ 1984
+ >>> mute()(numbers)()
+ >>> list(mute(True)(numbers)())
+ ['42', '1984']
+
+ """
+
+ def decorator(func):
+
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+
+ saved_stdout = sys.stdout
+ sys.stdout = cStringIO.StringIO()
+
+ try:
+ out = func(*args, **kwargs)
+ if returns_output:
+ out = isplit(sys.stdout.getvalue().strip())
+ finally:
+ sys.stdout = saved_stdout
+
+ return out
+
+ return wrapper
+
+ return decorator
+
def get_article(title, slug, content, lang, extra_metadata=None):
diff --git a/tests/test_importer.py b/tests/test_importer.py
new file mode 100644
index 00000000..c6ee9cfb
--- /dev/null
+++ b/tests/test_importer.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+
+import os
+
+from pelican.tools.pelican_import import wp2fields, fields2pelican
+from .support import unittest, temporary_folder, mute
+
+CUR_DIR = os.path.dirname(__file__)
+WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml')
+
+
+class TestWordpressXmlImporter(unittest.TestCase):
+
+
+ def setUp(self):
+ self.posts = wp2fields(WORDPRESS_XML_SAMPLE)
+
+
+ def test_ignore_empty_posts(self):
+
+ posts = list(self.posts)
+ self.assertTrue(posts)
+ for title, content, fname, date, author, categ, tags, format in posts:
+ self.assertTrue(title.strip())
+
+
+ def test_can_toggle_raw_html_code_parsing(self):
+
+ posts = list(self.posts)
+ r = lambda f: open(f).read()
+ silent_f2p = mute(True)(fields2pelican)
+
+ with temporary_folder() as temp:
+
+ rst_files = (r(f) for f in silent_f2p(posts, 'markdown', temp))
+ self.assertTrue(any('