diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 25fc45e5..264d7d36 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -12,6 +12,7 @@ import time from codecs import open +import collections from six.moves.urllib.error import URLError from six.moves.urllib.parse import urlparse from six.moves.urllib.request import urlretrieve @@ -29,11 +30,22 @@ except ImportError: logger = logging.getLogger(__name__) -def decode_wp_content(content, br=True): +def decode_wp_content(content, attached_files, br=True): pre_tags = {} if content.strip() == "": return "" + if attached_files: + for path, urls in attached_files.items(): + for url in urls: + content = re.sub(r'(]*href=")%s(")' % url, + r'\1%s\2' % path, + content) + + content = re.sub(r'(]*src=")%s(")' % url, + r'\1%s\2' % path, + content) + content += "\n" if "") @@ -651,10 +663,13 @@ def get_attachments(xml): def download_attachments(output_path, urls): - """Downloads WordPress attachments and returns a list of paths to - attachments that can be associated with a post (relative path to output - directory). Files that fail to download, will not be added to posts""" - locations = [] + """Downloads WordPress attachments and returns a dict {relpath: urls} of + attachments that can be associated with a post (relative path to output + directory). 
Files that fail to download, will not be added to posts + + {relpath: {set of urls}} + """ + locations = collections.defaultdict(set) for url in urls: path = urlparse(url).path # teardown path and rebuild to negate any errors with @@ -671,7 +686,8 @@ def download_attachments(output_path, urls): print('downloading {}'.format(filename)) try: urlretrieve(url, os.path.join(full_path, filename)) - locations.append(os.path.join(localpath, filename)) + relpath = os.path.join(localpath, filename) + locations[relpath].add(url) except (URLError, IOError) as e: # Python 2.7 throws an IOError rather Than URLError logger.warning("No file could be downloaded from %s\n%s", url, e) @@ -720,7 +736,7 @@ def fields2pelican( # Replace newlines with paragraphs wrapped with

so # HTML is valid before conversion if in_markup == 'wp-html': - new_content = decode_wp_content(content) + new_content = decode_wp_content(content, attached_files) else: paragraphs = content.splitlines() paragraphs = ['

{0}

'.format(p) for p in paragraphs]