From fe2eb86fafd4fd655847b95a2ae12868b70926de Mon Sep 17 00:00:00 2001 From: Stuart Axon Date: Thu, 12 Oct 2017 00:45:59 +0100 Subject: [PATCH 1/5] pelican-import, fix urls of attached files This patch wp-attach modifies ```]*href=")%s(")' % url, + r'\1/%s\2' % path, + content) + + content = re.sub(r'(]*src=")%s(")' % url, + r'\1/%s\2' % path, + content) + content += "\n" if "") @@ -651,10 +663,13 @@ def get_attachments(xml): def download_attachments(output_path, urls): - """Downloads WordPress attachments and returns a list of paths to - attachments that can be associated with a post (relative path to output - directory). Files that fail to download, will not be added to posts""" - locations = [] + """Downloads WordPress attachments and returns a returns a dict {url:path} of + attachments that can be associated with a post (relative path to output + directory). Files that fail to download, will not be added to posts + + {relpath: {set of urls}} + """ + locations = collections.defaultdict(set) for url in urls: path = urlparse(url).path # teardown path and rebuild to negate any errors with @@ -671,7 +686,8 @@ def download_attachments(output_path, urls): print('downloading {}'.format(filename)) try: urlretrieve(url, os.path.join(full_path, filename)) - locations.append(os.path.join(localpath, filename)) + relpath = os.path.join(localpath, filename) + locations[relpath].add(url) except (URLError, IOError) as e: # Python 2.7 throws an IOError rather Than URLError logger.warning("No file could be downloaded from %s\n%s", url, e) @@ -720,7 +736,7 @@ def fields2pelican( # Replace newlines with paragraphs wrapped with

so # HTML is valid before conversion if in_markup == 'wp-html': - new_content = decode_wp_content(content) + new_content = decode_wp_content(content, attached_files) else: paragraphs = content.splitlines() paragraphs = ['

<p>{0}</p>

'.format(p) for p in paragraphs] From c776cb2234817ae76d659329a245f7b36a1146f4 Mon Sep 17 00:00:00 2001 From: Stuart Axon Date: Fri, 13 Oct 2017 01:05:22 +0100 Subject: [PATCH 2/5] Process URLs after wordpress XML has been converted output format Don't download files more than once Handle post names that are just spaces --- pelican/tests/test_importer.py | 6 +---- pelican/tools/pelican_import.py | 42 ++++++++++++++++----------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 63d17539..6af59212 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -234,11 +234,7 @@ class TestWordpressXmlImporter(unittest.TestCase): with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file: decoded_content = decoded_file.read() self.assertEqual( - decode_wp_content( - encoded_content, - attached_files=None, - br=False - ), + decode_wp_content(encoded_content, br=False), decoded_content) def test_preserve_verbatim_formatting(self): diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 26df1a4d..fa5d85b9 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -30,22 +30,11 @@ except ImportError: logger = logging.getLogger(__name__) -def decode_wp_content(content, attached_files=None, br=True): +def decode_wp_content(content, br=True): pre_tags = {} if content.strip() == "": return "" - if attached_files: - for path, urls in attached_files.items(): - for url in urls: - content = re.sub(r'(]*href=")%s(")' % url, - r'\1/%s\2' % path, - content) - - content = re.sub(r'(]*src=")%s(")' % url, - r'\1/%s\2' % path, - content) - content += "\n" if "") @@ -144,9 +133,9 @@ def get_items(xml): return items -def get_filename(filename, post_id): - if filename is not None: - return filename +def get_filename(post_name, post_id): + if post_name and not post_name.isspace(): + return post_name else: return post_id @@ 
-166,9 +155,9 @@ def wp2fields(xml, wp_custpost=False): title = 'No title [%s]' % item.find('post_name').string logger.warning('Post "%s" is lacking a proper title', title) - filename = item.find('post_name').string + post_name = item.find('post_name').string post_id = item.find('post_id').string - filename = get_filename(filename, post_id) + filename = get_filename(post_name, post_id) content = item.find('encoded').string raw_date = item.find('post_date').string @@ -637,14 +626,14 @@ def get_attachments(xml): for item in items: kind = item.find('post_type').string - filename = item.find('post_name').string + post_name = item.find('post_name').string post_id = item.find('post_id').string if kind == 'attachment': attachments.append((item.find('post_parent').string, item.find('attachment_url').string)) else: - filename = get_filename(filename, post_id) + filename = get_filename(post_name, post_id) names[post_id] = filename attachedposts = {} for parent, url in attachments: @@ -683,10 +672,13 @@ def download_attachments(output_path, urls): full_path = os.path.join(output_path, localpath) if not os.path.exists(full_path): os.makedirs(full_path) + relpath = os.path.join(localpath, filename) + if url in locations[relpath]: + continue + print('downloading {}'.format(filename)) try: urlretrieve(url, os.path.join(full_path, filename)) - relpath = os.path.join(localpath, filename) locations[relpath].add(url) except (URLError, IOError) as e: # Python 2.7 throws an IOError rather Than URLError @@ -736,7 +728,7 @@ def fields2pelican( # Replace newlines with paragraphs wrapped with

so # HTML is valid before conversion if in_markup == 'wp-html': - new_content = decode_wp_content(content, attached_files) + new_content = decode_wp_content(content) else: paragraphs = content.splitlines() paragraphs = ['

<p>{0}</p>

'.format(p) for p in paragraphs] @@ -767,6 +759,14 @@ def fields2pelican( with open(out_filename, 'r', encoding='utf-8') as fs: content = fs.read() + + if attached_files: + for path, urls in attached_files.items(): + for url in urls: + content = re.sub(url, + r'{filename}/%s' % path, + content) + if out_markup == 'markdown': # In markdown, to insert a
<br />, end a line with two # or more spaces & then a end-of-line From 310ae7edeea19a051f112b85a781cb21f3c8627b Mon Sep 17 00:00:00 2001 From: Stuart Axon Date: Sat, 14 Oct 2017 17:35:51 +0100 Subject: [PATCH 3/5] Update pelican_import.py Link to ```{filename}/path/to/file``` --- pelican/tools/pelican_import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index fa5d85b9..a8e323fb 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -764,7 +764,7 @@ def fields2pelican( for path, urls in attached_files.items(): for url in urls: content = re.sub(url, - r'{filename}/%s' % path, + r'/{filename}/%s' % path, content) if out_markup == 'markdown': From ea1104061a33b5c804b89958f5025ed4000d369f Mon Sep 17 00:00:00 2001 From: Stuart Axon Date: Sat, 14 Oct 2017 18:24:43 +0100 Subject: [PATCH 4/5] Update test_importer.py Try fix for download_attachments test --- pelican/tests/test_importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 6af59212..f2c704c3 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -378,7 +378,7 @@ class TestWordpressXMLAttachements(unittest.TestCase): bad_url = 'http://localhost:1/not_a_file.txt' silent_da = mute()(download_attachments) with temporary_folder() as temp: - locations = list(silent_da(temp, [good_url, bad_url])) + locations = list(silent_da(temp, [good_url, bad_url]).keys()) self.assertEqual(1, len(locations)) directory = locations[0] self.assertTrue( From 2ece08afafdcb3d246a8c1596f9aeafeed9fa7bf Mon Sep 17 00:00:00 2001 From: Stuart Axon Date: Sat, 14 Oct 2017 19:46:16 +0100 Subject: [PATCH 5/5] Update pelican_import.py Don't create empty sets in download_attachments, this might fix an issue that came up in the tests.
--- pelican/tools/pelican_import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index a8e323fb..6fddb2be 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -673,7 +673,7 @@ def download_attachments(output_path, urls): if not os.path.exists(full_path): os.makedirs(full_path) relpath = os.path.join(localpath, filename) - if url in locations[relpath]: + if relpath in locations and url in locations[relpath]: continue print('downloading {}'.format(filename))