From c776cb2234817ae76d659329a245f7b36a1146f4 Mon Sep 17 00:00:00 2001 From: Stuart Axon Date: Fri, 13 Oct 2017 01:05:22 +0100 Subject: [PATCH] Process URLs after wordpress XML has been converted output format Don't download files more than once Handle post names that are just spaces --- pelican/tests/test_importer.py | 6 +---- pelican/tools/pelican_import.py | 42 ++++++++++++++++----------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 63d17539..6af59212 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -234,11 +234,7 @@ class TestWordpressXmlImporter(unittest.TestCase): with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file: decoded_content = decoded_file.read() self.assertEqual( - decode_wp_content( - encoded_content, - attached_files=None, - br=False - ), + decode_wp_content(encoded_content, br=False), decoded_content) def test_preserve_verbatim_formatting(self): diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 26df1a4d..fa5d85b9 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -30,22 +30,11 @@ except ImportError: logger = logging.getLogger(__name__) -def decode_wp_content(content, attached_files=None, br=True): +def decode_wp_content(content, br=True): pre_tags = {} if content.strip() == "": return "" - if attached_files: - for path, urls in attached_files.items(): - for url in urls: - content = re.sub(r'(<a[^>]*href=")%s(")' % url, - r'\1/%s\2' % path, - content) - - content = re.sub(r'(<img[^>]*src=")%s(")' % url, - r'\1/%s\2' % path, - content) - content += "\n" if "<pre" in content: pre_parts = content.split("</pre>") @@ -144,9 +133,9 @@ def get_items(xml): return items -def get_filename(filename, post_id): - if filename is not None: - return filename +def get_filename(post_name, post_id): + if post_name and not post_name.isspace(): + return post_name else: return post_id @@ -166,9 +155,9 @@ def wp2fields(xml, 
wp_custpost=False): title = 'No title [%s]' % item.find('post_name').string logger.warning('Post "%s" is lacking a proper title', title) - filename = item.find('post_name').string + post_name = item.find('post_name').string post_id = item.find('post_id').string - filename = get_filename(filename, post_id) + filename = get_filename(post_name, post_id) content = item.find('encoded').string raw_date = item.find('post_date').string @@ -637,14 +626,14 @@ def get_attachments(xml): for item in items: kind = item.find('post_type').string - filename = item.find('post_name').string + post_name = item.find('post_name').string post_id = item.find('post_id').string if kind == 'attachment': attachments.append((item.find('post_parent').string, item.find('attachment_url').string)) else: - filename = get_filename(filename, post_id) + filename = get_filename(post_name, post_id) names[post_id] = filename attachedposts = {} for parent, url in attachments: @@ -683,10 +672,13 @@ def download_attachments(output_path, urls): full_path = os.path.join(output_path, localpath) if not os.path.exists(full_path): os.makedirs(full_path) + relpath = os.path.join(localpath, filename) + if url in locations[relpath]: + continue + print('downloading {}'.format(filename)) try: urlretrieve(url, os.path.join(full_path, filename)) - relpath = os.path.join(localpath, filename) locations[relpath].add(url) except (URLError, IOError) as e: # Python 2.7 throws an IOError rather Than URLError @@ -736,7 +728,7 @@ def fields2pelican( # Replace newlines with paragraphs wrapped with <p>
so # HTML is valid before conversion if in_markup == 'wp-html': - new_content = decode_wp_content(content, attached_files) + new_content = decode_wp_content(content) else: paragraphs = content.splitlines() paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs] @@ -767,6 +759,14 @@ def fields2pelican( with open(out_filename, 'r', encoding='utf-8') as fs: content = fs.read() + + if attached_files: + for path, urls in attached_files.items(): + for url in urls: + content = re.sub(url, + r'{filename}/%s' % path, + content) + if out_markup == 'markdown': # In markdown, to insert a
<br />, end a line with two # or more spaces & then a end-of-line