From fe2eb86fafd4fd655847b95a2ae12868b70926de Mon Sep 17 00:00:00 2001 From: Stuart Axon Date: Thu, 12 Oct 2017 00:45:59 +0100 Subject: [PATCH] pelican-import, fix urls of attached files This patch wp-attach modifies ```]*href=")%s(")' % url, + r'\1/%s\2' % path, + content) + + content = re.sub(r'(]*src=")%s(")' % url, + r'\1/%s\2' % path, + content) + content += "\n" if "") @@ -651,10 +663,13 @@ def get_attachments(xml): def download_attachments(output_path, urls): - """Downloads WordPress attachments and returns a list of paths to - attachments that can be associated with a post (relative path to output - directory). Files that fail to download, will not be added to posts""" - locations = [] + """Downloads WordPress attachments and returns a returns a dict {url:path} of + attachments that can be associated with a post (relative path to output + directory). Files that fail to download, will not be added to posts + + {relpath: {set of urls}} + """ + locations = collections.defaultdict(set) for url in urls: path = urlparse(url).path # teardown path and rebuild to negate any errors with @@ -671,7 +686,8 @@ def download_attachments(output_path, urls): print('downloading {}'.format(filename)) try: urlretrieve(url, os.path.join(full_path, filename)) - locations.append(os.path.join(localpath, filename)) + relpath = os.path.join(localpath, filename) + locations[relpath].add(url) except (URLError, IOError) as e: # Python 2.7 throws an IOError rather Than URLError logger.warning("No file could be downloaded from %s\n%s", url, e) @@ -720,7 +736,7 @@ def fields2pelican( # Replace newlines with paragraphs wrapped with

<p> so # HTML is valid before conversion if in_markup == 'wp-html': - new_content = decode_wp_content(content) + new_content = decode_wp_content(content, attached_files) else: paragraphs = content.splitlines() paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs]