pelican-import, fix urls of attached files

This patch (wp-attach) modifies ```<img``` and ```<a href``` URLs to point at the attachment path, so ```http://example.com/wp-content/uploads/blah.png``` becomes ```wp-content/uploads/blah.png```. This needs wp-content in pelicanconf STATIC_FILES. I thought I could use {filename}, but since the importer generates source HTML this doesn't work. It would be nice if there were a way to customise the output path, but I'm not sure how that would work.
2025-10-15 20:28:56 +02:00 · 2017-10-12 00:45:59 +01:00 · 2017-10-12 00:45:59 +01:00 · fe2eb86faf
commit fe2eb86faf
parent 359ffcabb8
2 changed files with 28 additions and 8 deletions
--- a/pelican/tests/test_importer.py
+++ b/pelican/tests/test_importer.py
@ -234,7 +234,11 @@ class TestWordpressXmlImporter(unittest.TestCase):
            with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file:
                decoded_content = decoded_file.read()
                self.assertEqual(
-                    decode_wp_content(encoded_content, br=False),
+                    decode_wp_content(
                      encoded_content,
                      attached_files=None,
                      br=False
                    ),
                    decoded_content)
    def test_preserve_verbatim_formatting(self):
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@ -3,6 +3,7 @@
 from __future__ import print_function, unicode_literals
 import argparse
 import collections
 import logging
 import os
 import re
@ -29,11 +30,22 @@ except ImportError:
 logger = logging.getLogger(__name__)
-def decode_wp_content(content, br=True):
+def decode_wp_content(content, attached_files=None, br=True):
    pre_tags = {}
    if content.strip() == "":
        return ""
    if attached_files:
        for path, urls in attached_files.items():
            for url in urls:
                content = re.sub(r'(<a\s+[^>]*href=")%s(")' % url,
                                 r'\1/%s\2' % path,
                                 content)
                content = re.sub(r'(<img\s+[^>]*src=")%s(")' % url,
                                 r'\1/%s\2' % path,
                                 content)
    content += "\n"
    if "<pre" in content:
        pre_parts = content.split("</pre>")
@ -651,10 +663,13 @@ def get_attachments(xml):
 def download_attachments(output_path, urls):
-    """Downloads WordPress attachments and returns a list of paths to
+    """Downloads WordPress attachments and returns a dict {url:path} of
-    attachments that can be associated with a post (relative path to output
+        attachments that can be associated with a post (relative path to output
-    directory). Files that fail to download, will not be added to posts"""
+        directory). Files that fail to download, will not be added to posts
-    locations = []
+
    {relpath: {set of urls}}
    """
    locations = collections.defaultdict(set)
    for url in urls:
        path = urlparse(url).path
        # teardown path and rebuild to negate any errors with
@ -671,7 +686,8 @@ def download_attachments(output_path, urls):
        print('downloading {}'.format(filename))
        try:
            urlretrieve(url, os.path.join(full_path, filename))
-            locations.append(os.path.join(localpath, filename))
+            relpath = os.path.join(localpath, filename)
            locations[relpath].add(url)
        except (URLError, IOError) as e:
            # Python 2.7 throws an IOError rather Than URLError
            logger.warning("No file could be downloaded from %s\n%s", url, e)
@ -720,7 +736,7 @@ def fields2pelican(
                # Replace newlines with paragraphs wrapped with <p> so
                # HTML is valid before conversion
                if in_markup == 'wp-html':
-                    new_content = decode_wp_content(content)
+                    new_content = decode_wp_content(content, attached_files)
                else:
                    paragraphs = content.splitlines()
                    paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs]