mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
Avoid downloading the same URL over and over again
If multiple WordPress attachments point to the same URL, the importer redownloaded it each time. This is unnecessary and time-consuming, so download each URL only once.
This commit is contained in:
parent
00c455a748
commit
4648dfac91
2 changed files with 19 additions and 7 deletions
|
|
@ -377,9 +377,13 @@ class TestWordpressXMLAttachements(unittest.TestCase):
|
|||
good_url = path_to_file_url(real_file)
|
||||
bad_url = 'http://localhost:1/not_a_file.txt'
|
||||
silent_da = mute()(download_attachments)
|
||||
url_cache = set()
|
||||
with temporary_folder() as temp:
|
||||
locations = list(silent_da(temp, [good_url, bad_url]))
|
||||
locations = list(silent_da(temp, [good_url, bad_url], url_cache))
|
||||
self.assertEqual(1, len(locations))
|
||||
# only cache successful retrievals
|
||||
self.assertTrue(good_url in url_cache)
|
||||
self.assertFalse(bad_url in url_cache)
|
||||
directory = locations[0]
|
||||
self.assertTrue(
|
||||
directory.endswith(os.path.join('content', 'article.rst')),
|
||||
|
|
|
|||
|
|
@ -647,7 +647,7 @@ def get_attachments(xml):
|
|||
return attachedposts
|
||||
|
||||
|
||||
def download_attachments(output_path, urls):
|
||||
def download_attachments(output_path, urls, successful_url_cache):
|
||||
"""Downloads WordPress attachments and returns a list of paths to
|
||||
attachments that can be associated with a post (relative path to output
|
||||
directory). Files that fail to download, will not be added to posts"""
|
||||
|
|
@ -667,8 +667,12 @@ def download_attachments(output_path, urls):
|
|||
os.makedirs(full_path)
|
||||
print('downloading {}'.format(filename))
|
||||
try:
|
||||
quote_url = quote(url, safe="%/:=&?~#+!$,;'@()*[]")
|
||||
urlretrieve(quote_url, os.path.join(full_path, filename))
|
||||
if(url in successful_url_cache):
|
||||
print('already downloaded: {} ... skipping'.format(filename))
|
||||
else:
|
||||
quote_url = quote(url, safe="%/:=&?~#+!$,;'@()*[]")
|
||||
urlretrieve(quote_url, os.path.join(full_path, filename))
|
||||
successful_url_cache.add(url)
|
||||
locations.append(os.path.join(localpath, filename))
|
||||
except (URLError, IOError) as e:
|
||||
# Python 2.7 throws an IOError rather than URLError
|
||||
|
|
@ -681,6 +685,9 @@ def fields2pelican(
|
|||
dircat=False, strip_raw=False, disable_slugs=False,
|
||||
dirpage=False, filename_template=None, filter_author=None,
|
||||
wp_custpost=False, wp_attach=False, attachments=None):
|
||||
|
||||
successful_url_cache = set()
|
||||
|
||||
for (title, content, filename, date, author, categories, tags, status,
|
||||
kind, in_markup) in fields:
|
||||
if filter_author and filter_author != author:
|
||||
|
|
@ -690,7 +697,8 @@ def fields2pelican(
|
|||
if wp_attach and attachments:
|
||||
try:
|
||||
urls = attachments[filename]
|
||||
attached_files = download_attachments(output_path, urls)
|
||||
attached_files = download_attachments(output_path, urls,
|
||||
successful_url_cache)
|
||||
except KeyError:
|
||||
attached_files = None
|
||||
else:
|
||||
|
|
@ -759,8 +767,8 @@ def fields2pelican(
|
|||
fs.write(header + content)
|
||||
if wp_attach and attachments and None in attachments:
|
||||
print("downloading attachments that don't have a parent post")
|
||||
urls = attachments[None]
|
||||
download_attachments(output_path, urls)
|
||||
urls = set(attachments[None])
|
||||
download_attachments(output_path, urls, successful_url_cache)
|
||||
|
||||
|
||||
def main():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue