From ea3e160db12be85e9c56dab204227ca22b65f71d Mon Sep 17 00:00:00 2001 From: Alistair Magee Date: Fri, 10 Jan 2014 16:09:29 +0000 Subject: [PATCH] Extra functionality for pelican-import for wordpress imports --- pelican/tests/content/wordpressexport.xml | 269 ++++++++++++++++++++- pelican/tests/test_importer.py | 156 ++++++++++++- pelican/tools/pelican_import.py | 270 +++++++++++++++++----- 3 files changed, 640 insertions(+), 55 deletions(-) diff --git a/pelican/tests/content/wordpressexport.xml b/pelican/tests/content/wordpressexport.xml index 56d9a458..686b0fa7 100644 --- a/pelican/tests/content/wordpressexport.xml +++ b/pelican/tests/content/wordpressexport.xml @@ -681,6 +681,273 @@ proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]>_edit_last + + + A custom post in category 4 + http://thisisa.test/?p=175 + Thu, 01 Jan 1970 00:00:00 +0000 + bob + http://thisisa.test/?p=175 + + +
  • Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
  • +
  • Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
  • + + +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]>
    + + 175 + 2012-02-16 15:52:55 + 0000-00-00 00:00:00 + open + open + custpost1cat4 + publish + 0 + 0 + custom1 + + 0 + + + _edit_last + +
    - + + A custom post in category 5 + http://thisisa.test/?p=176 + Thu, 01 Jan 1970 00:00:00 +0000 + bob + http://thisisa.test/?p=176 + + +
  • Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
  • +
  • Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
  • + + +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]>
    + + 176 + 2012-02-16 15:52:55 + 0000-00-00 00:00:00 + open + open + custpost1cat5 + publish + 0 + 0 + custom1 + + 0 + + + _edit_last + + +
    + + A 2nd custom post type also in category 5 + http://thisisa.test/?p=177 + Thu, 01 Jan 1970 00:00:00 +0000 + bob + http://thisisa.test/?p=177 + + +
  • Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
  • +
  • Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
  • + + +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, +quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo +consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse +cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non +proident, sunt in culpa qui officia deserunt mollit anim id est laborum.]]>
    + + 177 + 2012-02-16 15:52:55 + 0000-00-00 00:00:00 + open + open + custpost2cat5 + publish + 0 + 0 + custom2 + + 0 + + + _edit_last + + +
    + + Attachment with a parent + http://thisisa.test/?attachment_id=24 + Sat, 04 Feb 2012 03:17:33 +0000 + bob + http://thisurlisinvalid.notarealdomain/not_an_image.jpg + + + + 25 + 2012-02-04 03:17:33 + 2012-02-04 03:17:33 + open + open + attachment-with-a-parent + inherit + 8 + 0 + attachment + + 0 + http://thisurlisinvalid.notarealdomain/not_an_image.jpg + + _wp_attachment_metadata + + + + _wp_attached_file + + + + _wp_attachment_image_alt + + + + + 2nd Attachment to same parent + http://thisisa.test/?attachment_id=25 + Sat, 04 Feb 2012 03:17:33 +0000 + bob + http://en.wikipedia.org/wiki/File:Pelikan_Walvis_Bay.jpg + + + + 25 + 2012-02-04 03:17:33 + 2012-02-04 03:17:33 + open + open + 2nd[attachment-to-same-parent + inherit + 8 + 0 + attachment + + 0 + http://en.wikipedia.org/wiki/File:Pelikan_Walvis_Bay.jpg + + _wp_attachment_metadata + + + + _wp_attached_file + + + + _wp_attachment_image_alt + + + + + Attachment with a different parent + http://thisisa.test/?attachment_id=26 + Sat, 04 Feb 2012 03:17:33 +0000 + bob + http://thisurlisinvalid.notarealdomain + + + + 25 + 2012-02-04 03:17:33 + 2012-02-04 03:17:33 + open + open + attachment-with-a-different-parent + inherit + 25 + 0 + attachment + + 0 + http://thisurlisinvalid.notarealdomain + + _wp_attachment_metadata + + + + _wp_attached_file + + + + _wp_attachment_image_alt + + + + diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 2e882720..9d6f911d 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals, print_function import os import re -from pelican.tools.pelican_import import wp2fields, fields2pelican, decode_wp_content, build_header +from pelican.tools.pelican_import import wp2fields, fields2pelican, decode_wp_content, build_header, build_markdown_header, get_attachments, download_attachments from pelican.tests.support import (unittest, temporary_folder, mute, skipIfNoExecutable) +from pelican.utils import slugify + CUR_DIR = os.path.dirname(__file__) WORDPRESS_XML_SAMPLE = os.path.join(CUR_DIR, 'content', 'wordpressexport.xml') WORDPRESS_ENCODED_CONTENT_SAMPLE = os.path.join(CUR_DIR, @@ -29,6 +31,7 @@ class TestWordpressXmlImporter(unittest.TestCase): def setUp(self): self.posts = list(wp2fields(WORDPRESS_XML_SAMPLE)) + self.custposts = list(wp2fields(WORDPRESS_XML_SAMPLE, True)) def test_ignore_empty_posts(self): self.assertTrue(self.posts) @@ -54,6 +57,112 @@ class TestWordpressXmlImporter(unittest.TestCase): fname = list(silent_f2p(test_post, 'markdown', temp, dirpage=True))[0] self.assertTrue(fname.endswith('pages%sempty.md' % os.path.sep)) + def test_dircat(self): + silent_f2p = mute(True)(fields2pelican) + test_posts = [] + for post in self.posts: + # check post kind + if len(post[5]) > 0: # Has a category + test_posts.append(post) + with temporary_folder() as temp: + fnames = list(silent_f2p(test_posts, 'markdown', temp, dircat=True)) + index = 0 + for post in test_posts: + name = post[2] + category = slugify(post[5][0]) + name += '.md' + filename = os.path.join(category, name) + out_name = fnames[index] + self.assertTrue(out_name.endswith(filename)) + index += 1 + + def test_unless_custom_post_all_items_should_be_pages_or_posts(self): + self.assertTrue(self.posts) + pages_data = [] + for title, content, fname, date, author, categ, tags, kind, format in self.posts: + if kind == 'page' or kind == 'article': + pass + else: + pages_data.append((title, fname)) + self.assertEqual(0, len(pages_data)) + + def test_recognise_custom_post_type(self): + self.assertTrue(self.custposts) + cust_data = [] + for title, content, fname, date, author, categ, tags, kind, format in self.custposts: + if kind == 'article' or kind == 'page': + pass + else: + cust_data.append((title, kind)) + self.assertEqual(3, len(cust_data)) + self.assertEqual(('A custom post in category 4', 'custom1'), cust_data[0]) + self.assertEqual(('A custom post in category 5', 'custom1'), cust_data[1]) + self.assertEqual(('A 2nd custom post type also in category 5', 'custom2'), cust_data[2]) + + def test_custom_posts_put_in_own_dir(self): + silent_f2p = mute(True)(fields2pelican) + test_posts = [] + for post in self.custposts: + # check post kind + if post[7] == 'article' or post[7] == 'page': + pass + else: + test_posts.append(post) + with temporary_folder() as temp: + fnames = list(silent_f2p(test_posts, 'markdown', temp, wp_custpost = True)) + index = 0 + for post in test_posts: + name = post[2] + kind = post[7] + name += '.md' + filename = os.path.join(kind, name) + out_name = fnames[index] + self.assertTrue(out_name.endswith(filename)) + index += 1 + + def test_custom_posts_put_in_own_dir_and_catagory_sub_dir(self): + silent_f2p = mute(True)(fields2pelican) + test_posts = [] + for post in self.custposts: + # check post kind + if post[7] == 'article' or post[7] == 'page': + pass + else: + test_posts.append(post) + with temporary_folder() as temp: + fnames = list(silent_f2p(test_posts, 'markdown', temp, + wp_custpost=True, dircat=True)) + index = 0 + for post in test_posts: + name = post[2] + kind = post[7] + category = slugify(post[5][0]) + name += '.md' + filename = os.path.join(kind, category, name) + out_name = fnames[index] + self.assertTrue(out_name.endswith(filename)) + index += 1 + + def test_wp_custpost_true_dirpage_false(self): + #pages should only be put in their own directory when dirpage = True + silent_f2p = mute(True)(fields2pelican) + test_posts = [] + for post in self.custposts: + # check post kind + if post[7] == 'page': + test_posts.append(post) + with temporary_folder() as temp: + fnames = list(silent_f2p(test_posts, 'markdown', temp, + wp_custpost=True, dirpage=False)) + index = 0 + for post in test_posts: + name = post[2] + name += '.md' + filename = os.path.join('pages', name) + out_name = fnames[index] + self.assertFalse(out_name.endswith(filename)) + + def test_can_toggle_raw_html_code_parsing(self): def r(f): with open(f) as infile: @@ -137,3 +246,48 @@ class TestBuildHeader(unittest.TestCase): 'これは広い幅の文字だけで構成されたタイトルです\n' + '##############################################\n\n') + def test_galleries_added_to_header(self): + header = build_header('test', None, None, None, None, + None, ['output/test1', 'output/test2']) + self.assertEqual(header, 'test\n####\n' + ':attachments: output/test1, ' + + 'output/test2\n\n') + + def test_galleries_added_to_markdown_header(self): + header = build_markdown_header('test', None, None, None, None, None, + ['output/test1', 'output/test2']) + self.assertEqual(header, 'Title: test\n' + 'Attachments: output/test1, ' + + 'output/test2\n\n') + +@unittest.skipUnless(BeautifulSoup, 'Needs BeautifulSoup module') +class TestWordpressXMLAttachements(unittest.TestCase): + def setUp(self): + self.attachments = get_attachments(WORDPRESS_XML_SAMPLE) + + def test_recognise_attachments(self): + self.assertTrue(self.attachments) + self.assertTrue(len(self.attachments.keys()) == 3) + + def test_attachments_associated_with_correct_post(self): + self.assertTrue(self.attachments) + for post in self.attachments.keys(): + if post is None: + self.assertTrue(self.attachments[post][0] == 'https://upload.wikimedia.org/wikipedia/commons/thumb/2/2c/Pelican_lakes_entrance02.jpg/240px-Pelican_lakes_entrance02.jpg') + elif post == 'with-excerpt': + self.assertTrue(self.attachments[post][0] == 'http://thisurlisinvalid.notarealdomain/not_an_image.jpg') + self.assertTrue(self.attachments[post][1] == 'http://en.wikipedia.org/wiki/File:Pelikan_Walvis_Bay.jpg') + elif post == 'with-tags': + self.assertTrue(self.attachments[post][0] == 'http://thisurlisinvalid.notarealdomain') + else: + self.fail('all attachments should match to a filename or None, {}'.format(post)) + + def test_download_attachments(self): + real_file = os.path.join(CUR_DIR, 'content/article.rst') + good_url = 'file://' + real_file + bad_url = 'http://www.notarealsite.notarealdomain/not_a_file.txt' + silent_da = mute()(download_attachments) + with temporary_folder() as temp: + #locations = download_attachments(temp, [good_url, bad_url]) + locations = list(silent_da(temp, [good_url, bad_url])) + self.assertTrue(len(locations) == 1) + directory = locations[0] + self.assertTrue(directory.endswith('content/article.rst')) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index e3599f0e..b04906c0 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -6,9 +6,15 @@ import argparse try: # py3k import from html.parser import HTMLParser + from urllib.request import urlretrieve + from urllib.parse import urlparse + from urllib.error import URLError except ImportError: # py2 import from HTMLParser import HTMLParser # NOQA + from urllib import urlretrieve + from urlparse import urlparse + from urllib2 import URLError import os import re import subprocess @@ -96,22 +102,30 @@ def decode_wp_content(content, br=True): return content - -def wp2fields(xml): - """Opens a wordpress XML file, and yield Pelican fields""" +def get_items(xml): + """Opens a wordpress xml file and returns a list of items""" try: from bs4 import BeautifulSoup except ImportError: error = ('Missing dependency ' '"BeautifulSoup4" and "lxml" required to import Wordpress XML files.') sys.exit(error) - - with open(xml, encoding='utf-8') as infile: xmlfile = infile.read() soup = BeautifulSoup(xmlfile, "xml") items = soup.rss.channel.findAll('item') + return items +def get_filename(filename, post_id): + if filename is not None: + return filename + else: + return post_id + +def wp2fields(xml, wp_custpost=False): + """Opens a wordpress XML file, and yield Pelican fields""" + + items = get_items(xml) for item in items: if item.find('status').string == "publish": @@ -123,26 +137,36 @@ def wp2fields(xml): title = 'No title [%s]' % item.find('post_name').string logger.warn('Post "%s" is lacking a proper title' % title) - content = item.find('encoded').string filename = item.find('post_name').string - - if filename is None: - filename = item.find('post_id').string - + post_id = item.find('post_id').string + filename = get_filename(filename, post_id) + + content = item.find('encoded').string raw_date = item.find('post_date').string date_object = time.strptime(raw_date, "%Y-%m-%d %H:%M:%S") date = time.strftime("%Y-%m-%d %H:%M", date_object) author = item.find('creator').string - + categories = [cat.string for cat in item.findAll('category', {'domain' : 'category'})] # caturl = [cat['nicename'] for cat in item.find(domain='category')] tags = [tag.string for tag in item.findAll('category', {'domain' : 'post_tag'})] kind = 'article' - if item.find('post_type').string == 'page': + post_type = item.find('post_type').string + if post_type == 'page': kind = 'page' - + elif wp_custpost: + if post_type == 'post': + pass + # Old behaviour was to name everything not a page as an article. + # Theoretically all attachments have status == inherit so + # no attachments should be here. But this statement is to + # maintain existing behaviour in case that doesn't hold true. + elif post_type == 'attachment': + pass + else: + kind = post_type yield (title, content, filename, date, author, categories, tags, kind, "wp-html") @@ -410,7 +434,6 @@ def tumblr2fields(api_key, blogname): offset += len(posts) posts = get_tumblr_posts(api_key, blogname, offset) - def feed2fields(file): """Read a feed and yield pelican fields""" import feedparser @@ -426,8 +449,7 @@ def feed2fields(file): yield (entry.title, entry.description, slug, date, author, [], tags, kind, "html") - -def build_header(title, date, author, categories, tags, slug): +def build_header(title, date, author, categories, tags, slug, attachments=None): from docutils.utils import column_width """Build a header from a list of fields""" @@ -442,10 +464,13 @@ def build_header(title, date, author, categories, tags, slug): header += ':tags: %s\n' % ', '.join(tags) if slug: header += ':slug: %s\n' % slug + if attachments: + header += ':attachments: %s\n' % ', '.join(attachments) header += '\n' return header -def build_markdown_header(title, date, author, categories, tags, slug): +def build_markdown_header(title, date, author, categories, tags, slug, + attachments=None): """Build a header from a list of fields""" header = 'Title: %s\n' % title if date: @@ -458,51 +483,162 @@ def build_markdown_header(title, date, author, categories, tags, slug): header += 'Tags: %s\n' % ', '.join(tags) if slug: header += 'Slug: %s\n' % slug + if attachments: + header += 'Attachments: %s\n' % ', '.join(attachments) header += '\n' return header +def get_ext(out_markup, in_markup='html'): + if in_markup == 'markdown' or out_markup == 'markdown': + ext = '.md' + else: + ext = '.rst' + return ext + +def get_out_filename(output_path, filename, ext, kind, + dirpage, dircat, categories, wp_custpost): + filename = os.path.basename(filename) + + # Enforce filename restrictions for various filesystems at once; see + # http://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words + # we do not need to filter words because an extension will be appended + filename = re.sub(r'[<>:"/\\|?*^% ]', '-', filename) # invalid chars + filename = filename.lstrip('.') # should not start with a dot + if not filename: + filename = '_' + filename = filename[:249] # allow for 5 extra characters + + out_filename = os.path.join(output_path, filename+ext) + # option to put page posts in pages/ subdirectory + if dirpage and kind == 'page': + pages_dir = os.path.join(output_path, 'pages') + if not os.path.isdir(pages_dir): + os.mkdir(pages_dir) + out_filename = os.path.join(pages_dir, filename+ext) + elif not dirpage and kind == 'page': + pass + # option to put wp custom post types in directories with post type + # names. Custom post types can also have categories so option to + # create subdirectories with category names + elif kind != 'article': + if wp_custpost: + typename = slugify(kind) + else: + typename = '' + kind = 'article' + if dircat and (len(categories) > 0): + catname = slugify(categories[0]) + else: + catname = '' + out_filename = os.path.join(output_path, typename, + catname, filename+ext) + if not os.path.isdir(os.path.join(output_path, typename, catname)): + os.makedirs(os.path.join(output_path, typename, catname)) + # option to put files in directories with categories names + elif dircat and (len(categories) > 0): + catname = slugify(categories[0]) + out_filename = os.path.join(output_path, catname, filename+ext) + if not os.path.isdir(os.path.join(output_path, catname)): + os.mkdir(os.path.join(output_path, catname)) + + return out_filename + +def get_attachments(xml): + """returns a dictionary of posts that have attachments with a list + of the attachment_urls + """ + items = get_items(xml) + names = {} + attachments = [] + + for item in items: + kind = item.find('post_type').string + filename = item.find('post_name').string + post_id = item.find('post_id').string + + if kind == 'attachment': + attachments.append((item.find('post_parent').string, + item.find('attachment_url').string)) + else: + filename = get_filename(filename, post_id) + names[post_id] = filename + attachedposts = {} + for parent, url in attachments: + try: + parent_name = names[parent] + except KeyError: + #attachment's parent is not a valid post + parent_name = None + + try: + attachedposts[parent_name].append(url) + except KeyError: + attachedposts[parent_name] = [] + attachedposts[parent_name].append(url) + return attachedposts + +def download_attachments(output_path, urls): + """Downloads wordpress attachments and returns a list of paths to + attachments that can be associated with a post (relative path to output + directory). Files that fail to download, will not be added to posts""" + locations = [] + for url in urls: + path = urlparse(url).path + #teardown path and rebuild to negate any errors with + #os.path.join and leading /'s + path = path.split('/') + filename = path.pop(-1) + localpath = '' + for item in path: + localpath = os.path.join(localpath, item) + full_path = os.path.join(output_path, localpath) + if not os.path.exists(full_path): + os.makedirs(full_path) + print('downloading {}'.format(filename)) + try: + urlretrieve(url, os.path.join(full_path, filename)) + locations.append(os.path.join(localpath, filename)) + except URLError as e: + error = ("No file could be downloaded from {}; Error {}" + .format(url, e)) + logger.warn(error) + except IOError as e: #Python 2.7 throws an IOError rather Than URLError + error = ("No file could be downloaded from {}; Error {}" + .format(url, e)) + logger.warn(error) + return locations + + def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=False, disable_slugs=False, - dirpage=False, filename_template=None, filter_author=None): + dirpage=False, filename_template=None, filter_author=None, + wp_custpost=False, wp_attach=False, attachments=None): for (title, content, filename, date, author, categories, tags, kind, in_markup) in fields: if filter_author and filter_author != author: continue slug = not disable_slugs and filename or None - if (in_markup == "markdown") or (out_markup == "markdown") : - ext = '.md' - header = build_markdown_header(title, date, author, categories, tags, slug) + + if wp_attach and attachments: + try: + urls = attachments[filename] + attached_files = download_attachments(output_path, urls) + except KeyError: + attached_files = None + else: + attached_files = None + + ext = get_ext(out_markup, in_markup) + if ext == '.md': + header = build_markdown_header(title, date, author, categories, + tags, slug, attached_files) else: out_markup = "rst" - ext = '.rst' - header = build_header(title, date, author, categories, tags, slug) - - filename = os.path.basename(filename) - - # Enforce filename restrictions for various filesystems at once; see - # http://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words - # we do not need to filter words because an extension will be appended - filename = re.sub(r'[<>:"/\\|?*^% ]', '-', filename) # invalid chars - filename = filename.lstrip('.') # should not start with a dot - if not filename: - filename = '_' - filename = filename[:249] # allow for 5 extra characters - - # option to put page posts in pages/ subdirectory - if dirpage and kind == 'page': - pages_dir = os.path.join(output_path, 'pages') - if not os.path.isdir(pages_dir): - os.mkdir(pages_dir) - out_filename = os.path.join(pages_dir, filename+ext) - # option to put files in directories with categories names - elif dircat and (len(categories) > 0): - catname = slugify(categories[0]) - out_filename = os.path.join(output_path, catname, filename+ext) - if not os.path.isdir(os.path.join(output_path, catname)): - os.mkdir(os.path.join(output_path, catname)) - else: - out_filename = os.path.join(output_path, filename+ext) + header = build_header(title, date, author, categories, + tags, slug, attached_files) + out_filename = get_out_filename(output_path, filename, ext, + kind, dirpage, dircat, categories, wp_custpost) print(out_filename) if in_markup in ("html", "wp-html"): @@ -550,8 +686,11 @@ def fields2pelican(fields, out_markup, output_path, with open(out_filename, 'w', encoding='utf-8') as fs: fs.write(header + content) - - + if wp_attach and attachments and None in attachments: + print("downloading attachments that don't have a parent post") + urls = attachments[None] + orphan_galleries = download_attachments(output_path, urls) + def main(): parser = argparse.ArgumentParser( description="Transform feed, WordPress, Tumblr, Dotclear, or Posterous " @@ -584,6 +723,19 @@ def main(): parser.add_argument('--strip-raw', action='store_true', dest='strip_raw', help="Strip raw HTML code that can't be converted to " "markup such as flash embeds or iframes (wordpress import only)") + parser.add_argument('--wp-custpost', action='store_true', + dest='wp_custpost', + help='Put wordpress custom post types in directories. If used with ' + '--dir-cat option directories will be created as ' + '/post_type/category/ (wordpress import only)') + parser.add_argument('--wp-attach', action='store_true', dest='wp_attach', + help='(wordpress import only) Download files uploaded to wordpress as ' + 'attachments. Files will be added to posts as a list in the post ' + 'header. All files will be downloaded, even if ' + "they aren't associated with a post. Files with be downloaded " + 'with their original path inside the output directory. ' + 'e.g. output/wp-uploads/date/postname/file.jpg ' + '-- Requires an internet connection --') parser.add_argument('--disable-slugs', action='store_true', dest='disable_slugs', help='Disable storing slugs from imported posts within output. ' @@ -620,8 +772,12 @@ def main(): error = "Unable to create the output folder: " + args.output exit(error) + if args.wp_attach and input_type != 'wordpress': + error = "You must be importing a wordpress xml to use the --wp-attach option" + exit(error) + if input_type == 'wordpress': - fields = wp2fields(args.input) + fields = wp2fields(args.input, args.wp_custpost or False) elif input_type == 'dotclear': fields = dc2fields(args.input) elif input_type == 'posterous': @@ -631,6 +787,11 @@ def main(): elif input_type == 'feed': fields = feed2fields(args.input) + if args.wp_attach: + attachments = get_attachments(args.input) + else: + attachments = None + init() # init logging fields2pelican(fields, args.markup, args.output, @@ -638,4 +799,7 @@ def main(): dirpage=args.dirpage or False, strip_raw=args.strip_raw or False, disable_slugs=args.disable_slugs or False, - filter_author=args.author) + filter_author=args.author, + wp_custpost = args.wp_custpost or False, + wp_attach = args.wp_attach or False, + attachments = attachments or None)