Merge pull request #2452 from stuaxo/patch-6

Importer: Avoid downloading duplicate post attachments
This commit is contained in:
Justin Mayer 2018-11-26 08:12:54 -08:00 committed by GitHub
commit 3596e04639
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 11 additions and 9 deletions

View file

@@ -417,20 +417,22 @@ class TestWordpressXMLAttachements(unittest.TestCase):
self.assertTrue(self.attachments)
for post in self.attachments.keys():
if post is None:
expected = ('https://upload.wikimedia.org/wikipedia/commons/'
'thumb/2/2c/Pelican_lakes_entrance02.jpg/'
'240px-Pelican_lakes_entrance02.jpg')
self.assertEqual(self.attachments[post][0], expected)
expected = {
('https://upload.wikimedia.org/wikipedia/commons/'
'thumb/2/2c/Pelican_lakes_entrance02.jpg/'
'240px-Pelican_lakes_entrance02.jpg')
}
self.assertEqual(self.attachments[post], expected)
elif post == 'with-excerpt':
expected_invalid = ('http://thisurlisinvalid.notarealdomain/'
'not_an_image.jpg')
expected_pelikan = ('http://en.wikipedia.org/wiki/'
'File:Pelikan_Walvis_Bay.jpg')
self.assertEqual(self.attachments[post][0], expected_invalid)
self.assertEqual(self.attachments[post][1], expected_pelikan)
self.assertEqual(self.attachments[post],
{expected_invalid, expected_pelikan})
elif post == 'with-tags':
expected_invalid = ('http://thisurlisinvalid.notarealdomain')
self.assertEqual(self.attachments[post][0], expected_invalid)
self.assertEqual(self.attachments[post], {expected_invalid})
else:
self.fail('all attachments should match to a '
'filename or None, {}'

View file

@@ -699,7 +699,7 @@ def get_attachments(xml):
else:
filename = get_filename(post_name, post_id)
names[post_id] = filename
attachedposts = defaultdict(list)
attachedposts = defaultdict(set)
for parent, url in attachments:
try:
parent_name = names[parent]
@@ -707,7 +707,7 @@ def get_attachments(xml):
# attachment's parent is not a valid post
parent_name = None
attachedposts[parent_name].append(url)
attachedposts[parent_name].add(url)
return attachedposts