From 4648dfac91382a1ca9abdbaf177d1eb8c2908cdd Mon Sep 17 00:00:00 2001 From: Peter Dahlberg Date: Mon, 9 May 2016 02:44:46 +0200 Subject: [PATCH] Avoid downloading the same URL over and over again If multiple WordPress attachments point to the same URL, the importer re-downloaded it every time. This is unnecessary and time-consuming, so download each URL only once. --- pelican/tests/test_importer.py | 6 +++++- pelican/tools/pelican_import.py | 20 ++++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 6af59212..a98644bf 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -377,9 +377,13 @@ class TestWordpressXMLAttachements(unittest.TestCase): good_url = path_to_file_url(real_file) bad_url = 'http://localhost:1/not_a_file.txt' silent_da = mute()(download_attachments) + url_cache = set() with temporary_folder() as temp: - locations = list(silent_da(temp, [good_url, bad_url])) + locations = list(silent_da(temp, [good_url, bad_url], url_cache)) self.assertEqual(1, len(locations)) + # only cache successful retrivals + self.assertTrue(good_url in url_cache) + self.assertFalse(bad_url in url_cache) directory = locations[0] self.assertTrue( directory.endswith(os.path.join('content', 'article.rst')), diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index a36e89d5..8333b90c 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -647,7 +647,7 @@ def get_attachments(xml): return attachedposts -def download_attachments(output_path, urls): +def download_attachments(output_path, urls, successful_url_cache): """Downloads WordPress attachments and returns a list of paths to attachments that can be associated with a post (relative path to output directory). 
Files that fail to download, will not be added to posts""" @@ -667,8 +667,12 @@ def download_attachments(output_path, urls): os.makedirs(full_path) print('downloading {}'.format(filename)) try: - quote_url = quote(url, safe="%/:=&?~#+!$,;'@()*[]") - urlretrieve(quote_url, os.path.join(full_path, filename)) + if(url in successful_url_cache): + print('already downloaded: {} ... skipping'.format(filename)) + else: + quote_url = quote(url, safe="%/:=&?~#+!$,;'@()*[]") + urlretrieve(quote_url, os.path.join(full_path, filename)) + successful_url_cache.add(url) locations.append(os.path.join(localpath, filename)) except (URLError, IOError) as e: # Python 2.7 throws an IOError rather Than URLError @@ -681,6 +685,9 @@ def fields2pelican( dircat=False, strip_raw=False, disable_slugs=False, dirpage=False, filename_template=None, filter_author=None, wp_custpost=False, wp_attach=False, attachments=None): + + successful_url_cache = set() + for (title, content, filename, date, author, categories, tags, status, kind, in_markup) in fields: if filter_author and filter_author != author: @@ -690,7 +697,8 @@ def fields2pelican( if wp_attach and attachments: try: urls = attachments[filename] - attached_files = download_attachments(output_path, urls) + attached_files = download_attachments(output_path, urls, + successful_url_cache) except KeyError: attached_files = None else: @@ -759,8 +767,8 @@ def fields2pelican( fs.write(header + content) if wp_attach and attachments and None in attachments: print("downloading attachments that don't have a parent post") - urls = attachments[None] - download_attachments(output_path, urls) + urls = set(attachments[None]) + download_attachments(output_path, urls, successful_url_cache) def main():