From 4648dfac91382a1ca9abdbaf177d1eb8c2908cdd Mon Sep 17 00:00:00 2001
From: Peter Dahlberg <catdog2@tuxzone.org>
Date: Mon, 9 May 2016 02:44:46 +0200
Subject: [PATCH] Avoid downloading the same URL over and over again

If multiple WordPress attachments point to the
same URL, the importer re-downloaded it each time.
This is unnecessary and time-consuming, so download
each URL only once.
---
 pelican/tests/test_importer.py  |  6 +++++-
 pelican/tools/pelican_import.py | 20 ++++++++++++++------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py
index 6af59212..a98644bf 100644
--- a/pelican/tests/test_importer.py
+++ b/pelican/tests/test_importer.py
@@ -377,9 +377,13 @@ class TestWordpressXMLAttachements(unittest.TestCase):
         good_url = path_to_file_url(real_file)
         bad_url = 'http://localhost:1/not_a_file.txt'
         silent_da = mute()(download_attachments)
+        url_cache = set()
         with temporary_folder() as temp:
-            locations = list(silent_da(temp, [good_url, bad_url]))
+            locations = list(silent_da(temp, [good_url, bad_url], url_cache))
             self.assertEqual(1, len(locations))
+            # only cache successful retrievals
+            self.assertTrue(good_url in url_cache)
+            self.assertFalse(bad_url in url_cache)
             directory = locations[0]
             self.assertTrue(
                 directory.endswith(os.path.join('content', 'article.rst')),
diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py
index a36e89d5..8333b90c 100755
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@@ -647,7 +647,7 @@ def get_attachments(xml):
     return attachedposts
 
 
-def download_attachments(output_path, urls):
+def download_attachments(output_path, urls, successful_url_cache):
     """Downloads WordPress attachments and returns a list of paths to
     attachments that can be associated with a post (relative path to output
     directory). Files that fail to download, will not be added to posts"""
@@ -667,8 +667,12 @@ def download_attachments(output_path, urls):
             os.makedirs(full_path)
         print('downloading {}'.format(filename))
         try:
-            quote_url = quote(url, safe="%/:=&?~#+!$,;'@()*[]")
-            urlretrieve(quote_url, os.path.join(full_path, filename))
+            if(url in successful_url_cache):
+                print('already downloaded: {} ... skipping'.format(filename))
+            else:
+                quote_url = quote(url, safe="%/:=&?~#+!$,;'@()*[]")
+                urlretrieve(quote_url, os.path.join(full_path, filename))
+                successful_url_cache.add(url)
             locations.append(os.path.join(localpath, filename))
         except (URLError, IOError) as e:
             # Python 2.7 throws an IOError rather Than URLError
@@ -681,6 +685,9 @@ def fields2pelican(
         dircat=False, strip_raw=False, disable_slugs=False,
         dirpage=False, filename_template=None, filter_author=None,
         wp_custpost=False, wp_attach=False, attachments=None):
+
+    successful_url_cache = set()
+
     for (title, content, filename, date, author, categories, tags, status,
             kind, in_markup) in fields:
         if filter_author and filter_author != author:
@@ -690,7 +697,8 @@ def fields2pelican(
         if wp_attach and attachments:
             try:
                 urls = attachments[filename]
-                attached_files = download_attachments(output_path, urls)
+                attached_files = download_attachments(output_path, urls,
+                                                      successful_url_cache)
             except KeyError:
                 attached_files = None
         else:
@@ -759,8 +767,8 @@ def fields2pelican(
             fs.write(header + content)
     if wp_attach and attachments and None in attachments:
         print("downloading attachments that don't have a parent post")
-        urls = attachments[None]
-        download_attachments(output_path, urls)
+        urls = set(attachments[None])
+        download_attachments(output_path, urls, successful_url_cache)
 
 
 def main():