diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 6af59212..63d17539 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -234,7 +234,11 @@ class TestWordpressXmlImporter(unittest.TestCase): with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file: decoded_content = decoded_file.read() self.assertEqual( - decode_wp_content(encoded_content, br=False), + decode_wp_content( + encoded_content, + attached_files=None, + br=False + ), decoded_content) def test_preserve_verbatim_formatting(self): diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 25fc45e5..26df1a4d 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -3,6 +3,7 @@ from __future__ import print_function, unicode_literals import argparse +import collections import logging import os import re @@ -29,11 +30,22 @@ except ImportError: logger = logging.getLogger(__name__) -def decode_wp_content(content, br=True): +def decode_wp_content(content, attached_files=None, br=True): pre_tags = {} if content.strip() == "": return "" + if attached_files: + for path, urls in attached_files.items(): + for url in urls: + content = re.sub(r'(]*href=")%s(")' % url, + r'\1/%s\2' % path, + content) + + content = re.sub(r'(]*src=")%s(")' % url, + r'\1/%s\2' % path, + content) + content += "\n" if "") @@ -651,10 +663,13 @@ def get_attachments(xml): def download_attachments(output_path, urls): - """Downloads WordPress attachments and returns a list of paths to - attachments that can be associated with a post (relative path to output - directory). Files that fail to download, will not be added to posts""" - locations = [] + """Downloads WordPress attachments and returns a dict {relpath: urls} of + attachments that can be associated with a post (relative path to output + directory). 
Files that fail to download, will not be added to posts + + {relpath: {set of urls}} + """ + locations = collections.defaultdict(set) for url in urls: path = urlparse(url).path # teardown path and rebuild to negate any errors with @@ -671,7 +686,8 @@ def download_attachments(output_path, urls): print('downloading {}'.format(filename)) try: urlretrieve(url, os.path.join(full_path, filename)) - locations.append(os.path.join(localpath, filename)) + relpath = os.path.join(localpath, filename) + locations[relpath].add(url) except (URLError, IOError) as e: # Python 2.7 throws an IOError rather Than URLError logger.warning("No file could be downloaded from %s\n%s", url, e) @@ -720,7 +736,7 @@ def fields2pelican( # Replace newlines with paragraphs wrapped with

<p> so # HTML is valid before conversion if in_markup == 'wp-html': - new_content = decode_wp_content(content) + new_content = decode_wp_content(content, attached_files) else: paragraphs = content.splitlines() paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs]