1
0
Fork 0
forked from github/pelican

Extra functionality for pelican-import for wordpress imports

This commit is contained in:
Alistair Magee 2014-01-10 16:09:29 +00:00
commit ea3e160db1
3 changed files with 640 additions and 55 deletions

View file

@ -682,5 +682,272 @@ proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></con
<wp:meta_value><![CDATA[3]]></wp:meta_value> <wp:meta_value><![CDATA[3]]></wp:meta_value>
</wp:postmeta> </wp:postmeta>
</item> </item>
</channel> <item>
<title>A custom post in category 4</title>
<link>http://thisisa.test/?p=175</link>
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
<dc:creator>bob</dc:creator>
<guid isPermaLink="false">http://thisisa.test/?p=175</guid>
<description></description>
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
<ul>
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
</ul>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>175</wp:post_id>
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>custpost1cat4</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>custom1</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<category domain="category" nicename="category-4"><![CDATA[Category 4]]></category>
<wp:postmeta>
<wp:meta_key>_edit_last</wp:meta_key>
<wp:meta_value><![CDATA[3]]></wp:meta_value>
</wp:postmeta>
</item>
<item>
<title>A custom post in category 5</title>
<link>http://thisisa.test/?p=176</link>
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
<dc:creator>bob</dc:creator>
<guid isPermaLink="false">http://thisisa.test/?p=176</guid>
<description></description>
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
<ul>
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
</ul>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>176</wp:post_id>
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>custpost1cat5</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>custom1</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<category domain="category" nicename="category-5"><![CDATA[Category 5]]></category>
<wp:postmeta>
<wp:meta_key>_edit_last</wp:meta_key>
<wp:meta_value><![CDATA[3]]></wp:meta_value>
</wp:postmeta>
</item>
<item>
<title>A 2nd custom post type also in category 5</title>
<link>http://thisisa.test/?p=177</link>
<pubDate>Thu, 01 Jan 1970 00:00:00 +0000</pubDate>
<dc:creator>bob</dc:creator>
<guid isPermaLink="false">http://thisisa.test/?p=177</guid>
<description></description>
<content:encoded><![CDATA[Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
<ul>
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
<li>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</li>
</ul>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>177</wp:post_id>
<wp:post_date>2012-02-16 15:52:55</wp:post_date>
<wp:post_date_gmt>0000-00-00 00:00:00</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>custpost2cat5</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>custom2</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<category domain="category" nicename="category-5"><![CDATA[Category 5]]></category>
<wp:postmeta>
<wp:meta_key>_edit_last</wp:meta_key>
<wp:meta_value><![CDATA[3]]></wp:meta_value>
</wp:postmeta>
</item>
<item>
<title>Attachment with a parent</title>
<link>http://thisisa.test/?attachment_id=24</link>
<pubDate>Sat, 04 Feb 2012 03:17:33 +0000</pubDate>
<dc:creator>bob</dc:creator>
<guid isPermaLink="false">http://thisurlisinvalid.notarealdomain/not_an_image.jpg</guid>
<description></description>
<content:encoded><![CDATA[]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>25</wp:post_id>
<wp:post_date>2012-02-04 03:17:33</wp:post_date>
<wp:post_date_gmt>2012-02-04 03:17:33</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>attachment-with-a-parent</wp:post_name>
<wp:status>inherit</wp:status>
<wp:post_parent>8</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>attachment</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<wp:attachment_url>http://thisurlisinvalid.notarealdomain/not_an_image.jpg</wp:attachment_url>
<wp:postmeta>
<wp:meta_key>_wp_attachment_metadata</wp:meta_key>
<wp:meta_value><![CDATA[a:5:{s:5:"width";s:3:"150";s:6:"height";s:3:"186";s:14:"hwstring_small";s:22:"height='96' width='77'";s:4:"file";s:20:"2012/02/pelican.png";s:10:"image_meta";a:10:{s:8:"aperture";s:1:"0";s:6:"credit";s:0:"";s:6:"camera";s:0:"";s:7:"caption";s:0:"";s:17:"created_timestamp";s:1:"0";s:9:"copyright";s:0:"";s:12:"focal_length";s:1:"0";s:3:"iso";s:1:"0";s:13:"shutter_speed";s:1:"0";s:5:"title";s:0:"";}}]]></wp:meta_value>
</wp:postmeta>
<wp:postmeta>
<wp:meta_key>_wp_attached_file</wp:meta_key>
<wp:meta_value><![CDATA[2012/02/stuff.png]]></wp:meta_value>
</wp:postmeta>
<wp:postmeta>
<wp:meta_key>_wp_attachment_image_alt</wp:meta_key>
<wp:meta_value><![CDATA[Stuff]]></wp:meta_value>
</wp:postmeta>
</item>
<item>
<title>2nd Attachment to same parent</title>
<link>http://thisisa.test/?attachment_id=25</link>
<pubDate>Sat, 04 Feb 2012 03:17:33 +0000</pubDate>
<dc:creator>bob</dc:creator>
<guid isPermaLink="false">http://en.wikipedia.org/wiki/File:Pelikan_Walvis_Bay.jpg</guid>
<description></description>
<content:encoded><![CDATA[]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>25</wp:post_id>
<wp:post_date>2012-02-04 03:17:33</wp:post_date>
<wp:post_date_gmt>2012-02-04 03:17:33</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>2nd[attachment-to-same-parent</wp:post_name>
<wp:status>inherit</wp:status>
<wp:post_parent>8</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>attachment</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<wp:attachment_url>http://en.wikipedia.org/wiki/File:Pelikan_Walvis_Bay.jpg</wp:attachment_url>
<wp:postmeta>
<wp:meta_key>_wp_attachment_metadata</wp:meta_key>
<wp:meta_value><![CDATA[a:5:{s:5:"width";s:3:"150";s:6:"height";s:3:"186";s:14:"hwstring_small";s:22:"height='96' width='77'";s:4:"file";s:20:"2012/02/pelican.png";s:10:"image_meta";a:10:{s:8:"aperture";s:1:"0";s:6:"credit";s:0:"";s:6:"camera";s:0:"";s:7:"caption";s:0:"";s:17:"created_timestamp";s:1:"0";s:9:"copyright";s:0:"";s:12:"focal_length";s:1:"0";s:3:"iso";s:1:"0";s:13:"shutter_speed";s:1:"0";s:5:"title";s:0:"";}}]]></wp:meta_value>
</wp:postmeta>
<wp:postmeta>
<wp:meta_key>_wp_attached_file</wp:meta_key>
<wp:meta_value><![CDATA[2012/02/stuff.png]]></wp:meta_value>
</wp:postmeta>
<wp:postmeta>
<wp:meta_key>_wp_attachment_image_alt</wp:meta_key>
<wp:meta_value><![CDATA[Stuff]]></wp:meta_value>
</wp:postmeta>
</item>
<item>
<title>Attachment with a different parent</title>
<link>http://thisisa.test/?attachment_id=26</link>
<pubDate>Sat, 04 Feb 2012 03:17:33 +0000</pubDate>
<dc:creator>bob</dc:creator>
<guid isPermaLink="false">http://thisurlisinvalid.notarealdomain</guid>
<description></description>
<content:encoded><![CDATA[]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>25</wp:post_id>
<wp:post_date>2012-02-04 03:17:33</wp:post_date>
<wp:post_date_gmt>2012-02-04 03:17:33</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>attachment-with-a-different-parent</wp:post_name>
<wp:status>inherit</wp:status>
<wp:post_parent>25</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>attachment</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<wp:attachment_url>http://thisurlisinvalid.notarealdomain</wp:attachment_url>
<wp:postmeta>
<wp:meta_key>_wp_attachment_metadata</wp:meta_key>
<wp:meta_value><![CDATA[a:5:{s:5:"width";s:3:"150";s:6:"height";s:3:"186";s:14:"hwstring_small";s:22:"height='96' width='77'";s:4:"file";s:20:"2012/02/pelican.png";s:10:"image_meta";a:10:{s:8:"aperture";s:1:"0";s:6:"credit";s:0:"";s:6:"camera";s:0:"";s:7:"caption";s:0:"";s:17:"created_timestamp";s:1:"0";s:9:"copyright";s:0:"";s:12:"focal_length";s:1:"0";s:3:"iso";s:1:"0";s:13:"shutter_speed";s:1:"0";s:5:"title";s:0:"";}}]]></wp:meta_value>
</wp:postmeta>
<wp:postmeta>
<wp:meta_key>_wp_attached_file</wp:meta_key>
<wp:meta_value><![CDATA[2012/02/stuff.png]]></wp:meta_value>
</wp:postmeta>
<wp:postmeta>
<wp:meta_key>_wp_attachment_image_alt</wp:meta_key>
<wp:meta_value><![CDATA[Stuff]]></wp:meta_value>
</wp:postmeta>
</item>
</channel>
</rss> </rss>

View file

@ -4,10 +4,12 @@ from __future__ import unicode_literals, print_function
import os import os
import re import re
from pelican.tools.pelican_import import wp2fields, fields2pelican, decode_wp_content, build_header from pelican.tools.pelican_import import wp2fields, fields2pelican, decode_wp_content, build_header, build_markdown_header, get_attachments, download_attachments
from pelican.tests.support import (unittest, temporary_folder, mute, from pelican.tests.support import (unittest, temporary_folder, mute,
skipIfNoExecutable) skipIfNoExecutable)
from pelican.utils import slugify
CUR_DIR = os.path.dirname(__file__) CUR_DIR = os.path.dirname(__file__)
WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml') WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml')
WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join(CUR_DIR, WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join(CUR_DIR,
@ -29,6 +31,7 @@ class TestWordpressXmlImporter(unittest.TestCase):
def setUp(self): def setUp(self):
self.posts = list(wp2fields(WORDPRESS_XML_SAMPLE)) self.posts = list(wp2fields(WORDPRESS_XML_SAMPLE))
self.custposts = list(wp2fields(WORDPRESS_XML_SAMPLE, True))
def test_ignore_empty_posts(self): def test_ignore_empty_posts(self):
self.assertTrue(self.posts) self.assertTrue(self.posts)
@ -54,6 +57,112 @@ class TestWordpressXmlImporter(unittest.TestCase):
fname = list(silent_f2p(test_post, 'markdown', temp, dirpage=True))[0] fname = list(silent_f2p(test_post, 'markdown', temp, dirpage=True))[0]
self.assertTrue(fname.endswith('pages%sempty.md' % os.path.sep)) self.assertTrue(fname.endswith('pages%sempty.md' % os.path.sep))
def test_dircat(self):
    """With dircat=True each post is written under its category slug."""
    quiet_convert = mute(True)(fields2pelican)
    # Field 5 of a post tuple is its category list; keep categorised posts.
    categorised = [entry for entry in self.posts if len(entry[5]) > 0]
    with temporary_folder() as workdir:
        written = list(quiet_convert(categorised, 'markdown', workdir,
                                     dircat=True))
    for position, entry in enumerate(categorised):
        # Expected tail: <slug of first category>/<filename>.md
        expected_tail = os.path.join(slugify(entry[5][0]), entry[2] + '.md')
        self.assertTrue(written[position].endswith(expected_tail))
def test_unless_custom_post_all_items_should_be_pages_or_posts(self):
    """Without wp_custpost every imported item is a page or an article."""
    self.assertTrue(self.posts)
    # Collect anything whose kind is neither 'page' nor 'article'.
    unexpected = [
        (title, fname)
        for (title, _content, fname, _date, _author, _categ, _tags, kind,
             _markup) in self.posts
        if not (kind == 'page' or kind == 'article')
    ]
    self.assertEqual(0, len(unexpected))
def test_recognise_custom_post_type(self):
    """With wp_custpost enabled custom post types keep their own kind."""
    self.assertTrue(self.custposts)
    # Everything that is not a plain article or page counts as custom.
    custom = [
        (title, kind)
        for (title, _content, _fname, _date, _author, _categ, _tags, kind,
             _markup) in self.custposts
        if kind not in ('article', 'page')
    ]
    self.assertEqual(3, len(custom))
    first, second, third = custom[0], custom[1], custom[2]
    self.assertEqual(('A custom post in category 4', 'custom1'), first)
    self.assertEqual(('A custom post in category 5', 'custom1'), second)
    self.assertEqual(('A 2nd custom post type also in category 5', 'custom2'),
                     third)
def test_custom_posts_put_in_own_dir(self):
    """Custom post types are written to a directory named after the type."""
    quiet_convert = mute(True)(fields2pelican)
    # Field 7 of a post tuple is its kind; keep only the custom kinds.
    custom_posts = [entry for entry in self.custposts
                    if entry[7] != 'article' and entry[7] != 'page']
    with temporary_folder() as workdir:
        written = list(quiet_convert(custom_posts, 'markdown', workdir,
                                     wp_custpost=True))
    for position, entry in enumerate(custom_posts):
        # Expected tail: <kind>/<filename>.md
        expected_tail = os.path.join(entry[7], entry[2] + '.md')
        self.assertTrue(written[position].endswith(expected_tail))
def test_custom_posts_put_in_own_dir_and_catagory_sub_dir(self):
    """Custom posts go to <kind>/<category slug>/ when dircat is also set."""
    quiet_convert = mute(True)(fields2pelican)
    # Field 7 of a post tuple is its kind; keep only the custom kinds.
    custom_posts = [entry for entry in self.custposts
                    if entry[7] != 'article' and entry[7] != 'page']
    with temporary_folder() as workdir:
        written = list(quiet_convert(custom_posts, 'markdown', workdir,
                                     wp_custpost=True, dircat=True))
    for position, entry in enumerate(custom_posts):
        # Expected tail: <kind>/<slug of first category>/<filename>.md
        expected_tail = os.path.join(entry[7], slugify(entry[5][0]),
                                     entry[2] + '.md')
        self.assertTrue(written[position].endswith(expected_tail))
def test_wp_custpost_true_dirpage_false(self):
    """Pages are only put in their own directory when dirpage is True."""
    silent_f2p = mute(True)(fields2pelican)
    # Field 7 of a post tuple is its kind; only pages are relevant here.
    test_posts = [post for post in self.custposts if post[7] == 'page']
    with temporary_folder() as temp:
        fnames = list(silent_f2p(test_posts, 'markdown', temp,
                                 wp_custpost=True, dirpage=False))
    # Compare every page with its own output name (not just the first
    # one): enumerate keeps the position in step with the post.
    for index, post in enumerate(test_posts):
        filename = os.path.join('pages', post[2] + '.md')
        self.assertFalse(fnames[index].endswith(filename))
def test_can_toggle_raw_html_code_parsing(self): def test_can_toggle_raw_html_code_parsing(self):
def r(f): def r(f):
with open(f) as infile: with open(f) as infile:
@ -137,3 +246,48 @@ class TestBuildHeader(unittest.TestCase):
'これは広い幅の文字だけで構成されたタイトルです\n' + 'これは広い幅の文字だけで構成されたタイトルです\n' +
'##############################################\n\n') '##############################################\n\n')
def test_galleries_added_to_header(self):
    """Attachment paths show up as an ``:attachments:`` header field."""
    attachment_paths = ['output/test1', 'output/test2']
    header = build_header('test', None, None, None, None, None,
                          attachment_paths)
    expected = ('test\n####\n'
                ':attachments: output/test1, output/test2\n\n')
    self.assertEqual(header, expected)
def test_galleries_added_to_markdown_header(self):
    """Attachment paths show up as an ``Attachments:`` header field."""
    attachment_paths = ['output/test1', 'output/test2']
    header = build_markdown_header('test', None, None, None, None, None,
                                   attachment_paths)
    expected = ('Title: test\n'
                'Attachments: output/test1, output/test2\n\n')
    self.assertEqual(header, expected)
@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module')
class TestWordpressXMLAttachements(unittest.TestCase):
    """Tests for get_attachments() and download_attachments()."""

    def setUp(self):
        # Mapping of parent post name (or None) to a list of attachment URLs.
        self.attachments = get_attachments(WORDPRESS_XML_SAMPLE)

    def test_recognise_attachments(self):
        """The sample export yields attachments for three parent keys."""
        self.assertTrue(self.attachments)
        # assertEqual reports both values when the count is wrong.
        self.assertEqual(3, len(self.attachments))

    def test_attachments_associated_with_correct_post(self):
        """Each attachment URL is filed under its parent post's name."""
        self.assertTrue(self.attachments)
        for post, urls in self.attachments.items():
            if post is None:
                self.assertEqual(
                    urls[0],
                    'https://upload.wikimedia.org/wikipedia/commons/thumb/2/2c/Pelican_lakes_entrance02.jpg/240px-Pelican_lakes_entrance02.jpg')
            elif post == 'with-excerpt':
                self.assertEqual(
                    urls[0],
                    'http://thisurlisinvalid.notarealdomain/not_an_image.jpg')
                self.assertEqual(
                    urls[1],
                    'http://en.wikipedia.org/wiki/File:Pelikan_Walvis_Bay.jpg')
            elif post == 'with-tags':
                self.assertEqual(
                    urls[0], 'http://thisurlisinvalid.notarealdomain')
            else:
                self.fail('all attachments should match to a filename or None, {}'.format(post))

    def test_download_attachments(self):
        """Only the URL that can be fetched is returned as a location."""
        real_file = os.path.join(CUR_DIR, 'content/article.rst')
        good_url = 'file://' + real_file
        bad_url = 'http://www.notarealsite.notarealdomain/not_a_file.txt'
        silent_da = mute()(download_attachments)
        with temporary_folder() as temp:
            locations = list(silent_da(temp, [good_url, bad_url]))
            self.assertEqual(1, len(locations))
            directory = locations[0]
            # Pass the value as the message so a failure shows the path.
            self.assertTrue(directory.endswith('content/article.rst'),
                            directory)

View file

@ -6,9 +6,15 @@ import argparse
try: try:
# py3k import # py3k import
from html.parser import HTMLParser from html.parser import HTMLParser
from urllib.request import urlretrieve
from urllib.parse import urlparse
from urllib.error import URLError
except ImportError: except ImportError:
# py2 import # py2 import
from HTMLParser import HTMLParser # NOQA from HTMLParser import HTMLParser # NOQA
from urllib import urlretrieve
from urlparse import urlparse
from urllib2 import URLError
import os import os
import re import re
import subprocess import subprocess
@ -96,22 +102,30 @@ def decode_wp_content(content, br=True):
return content return content
def get_items(xml):
def wp2fields(xml): """Opens a wordpress xml file and returns a list of items"""
"""Opens a wordpress XML file, and yield Pelican fields"""
try: try:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
except ImportError: except ImportError:
error = ('Missing dependency ' error = ('Missing dependency '
'"BeautifulSoup4" and "lxml" required to import Wordpress XML files.') '"BeautifulSoup4" and "lxml" required to import Wordpress XML files.')
sys.exit(error) sys.exit(error)
with open(xml, encoding='utf-8') as infile: with open(xml, encoding='utf-8') as infile:
xmlfile = infile.read() xmlfile = infile.read()
soup = BeautifulSoup(xmlfile, "xml") soup = BeautifulSoup(xmlfile, "xml")
items = soup.rss.channel.findAll('item') items = soup.rss.channel.findAll('item')
return items
def get_filename(filename, post_id):
    """Return ``filename``, or ``post_id`` when ``filename`` is None."""
    return post_id if filename is None else filename
def wp2fields(xml, wp_custpost=False):
"""Opens a wordpress XML file, and yield Pelican fields"""
items = get_items(xml)
for item in items: for item in items:
if item.find('status').string == "publish": if item.find('status').string == "publish":
@ -123,12 +137,11 @@ def wp2fields(xml):
title = 'No title [%s]' % item.find('post_name').string title = 'No title [%s]' % item.find('post_name').string
logger.warn('Post "%s" is lacking a proper title' % title) logger.warn('Post "%s" is lacking a proper title' % title)
content = item.find('encoded').string
filename = item.find('post_name').string filename = item.find('post_name').string
post_id = item.find('post_id').string
filename = get_filename(filename, post_id)
if filename is None: content = item.find('encoded').string
filename = item.find('post_id').string
raw_date = item.find('post_date').string raw_date = item.find('post_date').string
date_object = time.strptime(raw_date, "%Y-%m-%d %H:%M:%S") date_object = time.strptime(raw_date, "%Y-%m-%d %H:%M:%S")
date = time.strftime("%Y-%m-%d %H:%M", date_object) date = time.strftime("%Y-%m-%d %H:%M", date_object)
@ -140,9 +153,20 @@ def wp2fields(xml):
tags = [tag.string for tag in item.findAll('category', {'domain' : 'post_tag'})] tags = [tag.string for tag in item.findAll('category', {'domain' : 'post_tag'})]
kind = 'article' kind = 'article'
if item.find('post_type').string == 'page': post_type = item.find('post_type').string
if post_type == 'page':
kind = 'page' kind = 'page'
elif wp_custpost:
if post_type == 'post':
pass
# Old behaviour was to name everything not a page as an article.
# Theoretically all attachments have status == inherit so
# no attachments should be here. But this statement is to
# maintain existing behaviour in case that doesn't hold true.
elif post_type == 'attachment':
pass
else:
kind = post_type
yield (title, content, filename, date, author, categories, tags, yield (title, content, filename, date, author, categories, tags,
kind, "wp-html") kind, "wp-html")
@ -410,7 +434,6 @@ def tumblr2fields(api_key, blogname):
offset += len(posts) offset += len(posts)
posts = get_tumblr_posts(api_key, blogname, offset) posts = get_tumblr_posts(api_key, blogname, offset)
def feed2fields(file): def feed2fields(file):
"""Read a feed and yield pelican fields""" """Read a feed and yield pelican fields"""
import feedparser import feedparser
@ -426,8 +449,7 @@ def feed2fields(file):
yield (entry.title, entry.description, slug, date, author, [], tags, yield (entry.title, entry.description, slug, date, author, [], tags,
kind, "html") kind, "html")
def build_header(title, date, author, categories, tags, slug, attachments=None):
def build_header(title, date, author, categories, tags, slug):
from docutils.utils import column_width from docutils.utils import column_width
"""Build a header from a list of fields""" """Build a header from a list of fields"""
@ -442,10 +464,13 @@ def build_header(title, date, author, categories, tags, slug):
header += ':tags: %s\n' % ', '.join(tags) header += ':tags: %s\n' % ', '.join(tags)
if slug: if slug:
header += ':slug: %s\n' % slug header += ':slug: %s\n' % slug
if attachments:
header += ':attachments: %s\n' % ', '.join(attachments)
header += '\n' header += '\n'
return header return header
def build_markdown_header(title, date, author, categories, tags, slug): def build_markdown_header(title, date, author, categories, tags, slug,
attachments=None):
"""Build a header from a list of fields""" """Build a header from a list of fields"""
header = 'Title: %s\n' % title header = 'Title: %s\n' % title
if date: if date:
@ -458,25 +483,20 @@ def build_markdown_header(title, date, author, categories, tags, slug):
header += 'Tags: %s\n' % ', '.join(tags) header += 'Tags: %s\n' % ', '.join(tags)
if slug: if slug:
header += 'Slug: %s\n' % slug header += 'Slug: %s\n' % slug
if attachments:
header += 'Attachments: %s\n' % ', '.join(attachments)
header += '\n' header += '\n'
return header return header
def fields2pelican(fields, out_markup, output_path, def get_ext(out_markup, in_markup='html'):
dircat=False, strip_raw=False, disable_slugs=False, if in_markup == 'markdown' or out_markup == 'markdown':
dirpage=False, filename_template=None, filter_author=None):
for (title, content, filename, date, author, categories, tags,
kind, in_markup) in fields:
if filter_author and filter_author != author:
continue
slug = not disable_slugs and filename or None
if (in_markup == "markdown") or (out_markup == "markdown") :
ext = '.md' ext = '.md'
header = build_markdown_header(title, date, author, categories, tags, slug)
else: else:
out_markup = "rst"
ext = '.rst' ext = '.rst'
header = build_header(title, date, author, categories, tags, slug) return ext
def get_out_filename(output_path, filename, ext, kind,
dirpage, dircat, categories, wp_custpost):
filename = os.path.basename(filename) filename = os.path.basename(filename)
# Enforce filename restrictions for various filesystems at once; see # Enforce filename restrictions for various filesystems at once; see
@ -488,21 +508,137 @@ def fields2pelican(fields, out_markup, output_path,
filename = '_' filename = '_'
filename = filename[:249] # allow for 5 extra characters filename = filename[:249] # allow for 5 extra characters
out_filename = os.path.join(output_path, filename+ext)
# option to put page posts in pages/ subdirectory # option to put page posts in pages/ subdirectory
if dirpage and kind == 'page': if dirpage and kind == 'page':
pages_dir = os.path.join(output_path, 'pages') pages_dir = os.path.join(output_path, 'pages')
if not os.path.isdir(pages_dir): if not os.path.isdir(pages_dir):
os.mkdir(pages_dir) os.mkdir(pages_dir)
out_filename = os.path.join(pages_dir, filename+ext) out_filename = os.path.join(pages_dir, filename+ext)
elif not dirpage and kind == 'page':
pass
# option to put wp custom post types in directories with post type
# names. Custom post types can also have categories so option to
# create subdirectories with category names
elif kind != 'article':
if wp_custpost:
typename = slugify(kind)
else:
typename = ''
kind = 'article'
if dircat and (len(categories) > 0):
catname = slugify(categories[0])
else:
catname = ''
out_filename = os.path.join(output_path, typename,
catname, filename+ext)
if not os.path.isdir(os.path.join(output_path, typename, catname)):
os.makedirs(os.path.join(output_path, typename, catname))
# option to put files in directories with categories names # option to put files in directories with categories names
elif dircat and (len(categories) > 0): elif dircat and (len(categories) > 0):
catname = slugify(categories[0]) catname = slugify(categories[0])
out_filename = os.path.join(output_path, catname, filename+ext) out_filename = os.path.join(output_path, catname, filename+ext)
if not os.path.isdir(os.path.join(output_path, catname)): if not os.path.isdir(os.path.join(output_path, catname)):
os.mkdir(os.path.join(output_path, catname)) os.mkdir(os.path.join(output_path, catname))
else:
out_filename = os.path.join(output_path, filename+ext)
return out_filename
def get_attachments(xml):
    """Return a dictionary mapping each post's filename to the list of
    attachment URLs whose parent is that post.

    Attachments whose parent id matches no non-attachment item in the
    export are collected under the key ``None``.
    """
    post_names = {}
    parent_urls = []
    for item in get_items(xml):
        post_type = item.find('post_type').string
        post_name = item.find('post_name').string
        post_id = item.find('post_id').string
        if post_type != 'attachment':
            # Remember the output name of every non-attachment item by id.
            post_names[post_id] = get_filename(post_name, post_id)
            continue
        parent_urls.append((item.find('post_parent').string,
                            item.find('attachment_url').string))

    by_post = {}
    for parent_id, url in parent_urls:
        # A parent id that is not a known post resolves to None.
        by_post.setdefault(post_names.get(parent_id), []).append(url)
    return by_post
def download_attachments(output_path, urls):
    """Download WordPress attachments into ``output_path``.

    The directory part of each URL's path is recreated below
    ``output_path`` and the file is saved inside it.  URLs that fail to
    download are logged and left out of the result, so they will not be
    added to posts.

    Returns a list of paths to the downloaded files, relative to
    ``output_path``.
    """
    locations = []
    for url in urls:
        segments = urlparse(url).path.split('/')
        filename = segments.pop()
        # Rebuild the directory part from its segments.  Dropping empty
        # segments removes the leading '/' (which would make os.path.join
        # discard output_path); dropping '.' and '..' keeps a crafted URL
        # from placing files outside output_path.
        localpath = os.path.join(
            '', *[part for part in segments if part not in ('', '.', '..')])
        full_path = os.path.join(output_path, localpath)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        print('downloading {}'.format(filename))
        try:
            urlretrieve(url, os.path.join(full_path, filename))
        except (URLError, IOError) as e:
            # Python 2.7 throws an IOError rather than URLError.
            logger.warning("No file could be downloaded from {}; Error {}"
                           .format(url, e))
        else:
            locations.append(os.path.join(localpath, filename))
    return locations
def fields2pelican(fields, out_markup, output_path,
dircat=False, strip_raw=False, disable_slugs=False,
dirpage=False, filename_template=None, filter_author=None,
wp_custpost=False, wp_attach=False, attachments=None):
for (title, content, filename, date, author, categories, tags,
kind, in_markup) in fields:
if filter_author and filter_author != author:
continue
slug = not disable_slugs and filename or None
if wp_attach and attachments:
try:
urls = attachments[filename]
attached_files = download_attachments(output_path, urls)
except KeyError:
attached_files = None
else:
attached_files = None
ext = get_ext(out_markup, in_markup)
if ext == '.md':
header = build_markdown_header(title, date, author, categories,
tags, slug, attached_files)
else:
out_markup = "rst"
header = build_header(title, date, author, categories,
tags, slug, attached_files)
out_filename = get_out_filename(output_path, filename, ext,
kind, dirpage, dircat, categories, wp_custpost)
print(out_filename) print(out_filename)
if in_markup in ("html", "wp-html"): if in_markup in ("html", "wp-html"):
@ -550,7 +686,10 @@ def fields2pelican(fields, out_markup, output_path,
with open(out_filename, 'w', encoding='utf-8') as fs: with open(out_filename, 'w', encoding='utf-8') as fs:
fs.write(header + content) fs.write(header + content)
if wp_attach and attachments and None in attachments:
print("downloading attachments that don't have a parent post")
urls = attachments[None]
orphan_galleries = download_attachments(output_path, urls)
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
@ -584,6 +723,19 @@ def main():
parser.add_argument('--strip-raw', action='store_true', dest='strip_raw', parser.add_argument('--strip-raw', action='store_true', dest='strip_raw',
help="Strip raw HTML code that can't be converted to " help="Strip raw HTML code that can't be converted to "
"markup such as flash embeds or iframes (wordpress import only)") "markup such as flash embeds or iframes (wordpress import only)")
parser.add_argument('--wp-custpost', action='store_true',
dest='wp_custpost',
help='Put wordpress custom post types in directories. If used with '
'--dir-cat option directories will be created as '
'/post_type/category/ (wordpress import only)')
parser.add_argument('--wp-attach', action='store_true', dest='wp_attach',
help='(wordpress import only) Download files uploaded to wordpress as '
'attachments. Files will be added to posts as a list in the post '
'header. All files will be downloaded, even if '
"they aren't associated with a post. Files with be downloaded "
'with their original path inside the output directory. '
'e.g. output/wp-uploads/date/postname/file.jpg '
'-- Requires an internet connection --')
parser.add_argument('--disable-slugs', action='store_true', parser.add_argument('--disable-slugs', action='store_true',
dest='disable_slugs', dest='disable_slugs',
help='Disable storing slugs from imported posts within output. ' help='Disable storing slugs from imported posts within output. '
@ -620,8 +772,12 @@ def main():
error = "Unable to create the output folder: " + args.output error = "Unable to create the output folder: " + args.output
exit(error) exit(error)
if args.wp_attach and input_type != 'wordpress':
error = "You must be importing a wordpress xml to use the --wp-attach option"
exit(error)
if input_type == 'wordpress': if input_type == 'wordpress':
fields = wp2fields(args.input) fields = wp2fields(args.input, args.wp_custpost or False)
elif input_type == 'dotclear': elif input_type == 'dotclear':
fields = dc2fields(args.input) fields = dc2fields(args.input)
elif input_type == 'posterous': elif input_type == 'posterous':
@ -631,6 +787,11 @@ def main():
elif input_type == 'feed': elif input_type == 'feed':
fields = feed2fields(args.input) fields = feed2fields(args.input)
if args.wp_attach:
attachments = get_attachments(args.input)
else:
attachments = None
init() # init logging init() # init logging
fields2pelican(fields, args.markup, args.output, fields2pelican(fields, args.markup, args.output,
@ -638,4 +799,7 @@ def main():
dirpage=args.dirpage or False, dirpage=args.dirpage or False,
strip_raw=args.strip_raw or False, strip_raw=args.strip_raw or False,
disable_slugs=args.disable_slugs or False, disable_slugs=args.disable_slugs or False,
filter_author=args.author) filter_author=args.author,
wp_custpost = args.wp_custpost or False,
wp_attach = args.wp_attach or False,
attachments = attachments or None)