diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 6af59212..a98644bf 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -377,9 +377,13 @@ class TestWordpressXMLAttachements(unittest.TestCase): good_url = path_to_file_url(real_file) bad_url = 'http://localhost:1/not_a_file.txt' silent_da = mute()(download_attachments) + url_cache = set() with temporary_folder() as temp: - locations = list(silent_da(temp, [good_url, bad_url])) + locations = list(silent_da(temp, [good_url, bad_url], url_cache)) self.assertEqual(1, len(locations)) + # only cache successful retrievals + self.assertTrue(good_url in url_cache) + self.assertFalse(bad_url in url_cache) directory = locations[0] self.assertTrue( directory.endswith(os.path.join('content', 'article.rst')), diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index a36e89d5..8333b90c 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -647,7 +647,7 @@ def get_attachments(xml): return attachedposts -def download_attachments(output_path, urls): +def download_attachments(output_path, urls, successful_url_cache): """Downloads WordPress attachments and returns a list of paths to attachments that can be associated with a post (relative path to output directory). Files that fail to download, will not be added to posts""" @@ -667,8 +667,12 @@ def download_attachments(output_path, urls): os.makedirs(full_path) print('downloading {}'.format(filename)) try: - quote_url = quote(url, safe="%/:=&?~#+!$,;'@()*[]") - urlretrieve(quote_url, os.path.join(full_path, filename)) + if(url in successful_url_cache): + print('already downloaded: {} ... 
skipping'.format(filename)) + else: + quote_url = quote(url, safe="%/:=&?~#+!$,;'@()*[]") + urlretrieve(quote_url, os.path.join(full_path, filename)) + successful_url_cache.add(url) locations.append(os.path.join(localpath, filename)) except (URLError, IOError) as e: # Python 2.7 throws an IOError rather Than URLError @@ -681,6 +685,9 @@ def fields2pelican( dircat=False, strip_raw=False, disable_slugs=False, dirpage=False, filename_template=None, filter_author=None, wp_custpost=False, wp_attach=False, attachments=None): + + successful_url_cache = set() + for (title, content, filename, date, author, categories, tags, status, kind, in_markup) in fields: if filter_author and filter_author != author: @@ -690,7 +697,8 @@ def fields2pelican( if wp_attach and attachments: try: urls = attachments[filename] - attached_files = download_attachments(output_path, urls) + attached_files = download_attachments(output_path, urls, + successful_url_cache) except KeyError: attached_files = None else: @@ -759,8 +767,8 @@ def fields2pelican( fs.write(header + content) if wp_attach and attachments and None in attachments: print("downloading attachments that don't have a parent post") - urls = attachments[None] - download_attachments(output_path, urls) + urls = set(attachments[None]) + download_attachments(output_path, urls, successful_url_cache) def main():