Process URLs after the WordPress XML has been converted to the output format

Don't download files more than once. Handle post names that are just spaces.
2025-10-15 20:28:56 +02:00 · 2017-10-13 01:05:22 +01:00 · 2017-10-13 01:05:22 +01:00 · c776cb2234
commit c776cb2234
parent fe2eb86faf
2 changed files with 22 additions and 26 deletions
--- a/pelican/tests/test_importer.py
+++ b/pelican/tests/test_importer.py
@ -234,11 +234,7 @@ class TestWordpressXmlImporter(unittest.TestCase):
            with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file:
                decoded_content = decoded_file.read()
                self.assertEqual(
-                    decode_wp_content(
-                      encoded_content,
-                      attached_files=None,
-                      br=False
-                    ),
+                    decode_wp_content(encoded_content, br=False),
                    decoded_content)

    def test_preserve_verbatim_formatting(self):
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@ -30,22 +30,11 @@ except ImportError:
 logger = logging.getLogger(__name__)


-def decode_wp_content(content, attached_files=None, br=True):
+def decode_wp_content(content, br=True):
    pre_tags = {}
    if content.strip() == "":
        return ""

-    if attached_files:
-        for path, urls in attached_files.items():
-            for url in urls:
-                content = re.sub(r'(<a\s+[^>]*href=")%s(")' % url,
-                                 r'\1/%s\2' % path,
-                                 content)
-
-                content = re.sub(r'(<img\s+[^>]*src=")%s(")' % url,
-                                 r'\1/%s\2' % path,
-                                 content)
-
    content += "\n"
    if "<pre" in content:
        pre_parts = content.split("</pre>")
@ -144,9 +133,9 @@ def get_items(xml):
    return items


-def get_filename(filename, post_id):
-    if filename is not None:
-        return filename
+def get_filename(post_name, post_id):
+    if post_name and not post_name.isspace():
+        return post_name
    else:
        return post_id

@ -166,9 +155,9 @@ def wp2fields(xml, wp_custpost=False):
                title = 'No title [%s]' % item.find('post_name').string
                logger.warning('Post "%s" is lacking a proper title', title)

-            filename = item.find('post_name').string
+            post_name = item.find('post_name').string
            post_id = item.find('post_id').string
-            filename = get_filename(filename, post_id)
+            filename = get_filename(post_name, post_id)

            content = item.find('encoded').string
            raw_date = item.find('post_date').string
@ -637,14 +626,14 @@ def get_attachments(xml):

    for item in items:
        kind = item.find('post_type').string
-        filename = item.find('post_name').string
+        post_name = item.find('post_name').string
        post_id = item.find('post_id').string

        if kind == 'attachment':
            attachments.append((item.find('post_parent').string,
                                item.find('attachment_url').string))
        else:
-            filename = get_filename(filename, post_id)
+            filename = get_filename(post_name, post_id)
            names[post_id] = filename
    attachedposts = {}
    for parent, url in attachments:
@ -683,10 +672,13 @@ def download_attachments(output_path, urls):
        full_path = os.path.join(output_path, localpath)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
+        relpath = os.path.join(localpath, filename)
+        if url in locations[relpath]:
+            continue
+
        print('downloading {}'.format(filename))
        try:
            urlretrieve(url, os.path.join(full_path, filename))
-            relpath = os.path.join(localpath, filename)
            locations[relpath].add(url)
        except (URLError, IOError) as e:
            # Python 2.7 throws an IOError rather Than URLError
@ -736,7 +728,7 @@ def fields2pelican(
                # Replace newlines with paragraphs wrapped with <p> so
                # HTML is valid before conversion
                if in_markup == 'wp-html':
-                    new_content = decode_wp_content(content, attached_files)
+                    new_content = decode_wp_content(content)
                else:
                    paragraphs = content.splitlines()
                    paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs]
@ -767,6 +759,14 @@ def fields2pelican(

            with open(out_filename, 'r', encoding='utf-8') as fs:
                content = fs.read()
+
+                if attached_files:
+                    for path, urls in attached_files.items():
+                        for url in urls:
+                            content = re.sub(url,
+                                             r'{filename}/%s' % path,
+                                             content)
+
                if out_markup == 'markdown':
                    # In markdown, to insert a <br />, end a line with two
                    # or more spaces & then a end-of-line