From fe2eb86fafd4fd655847b95a2ae12868b70926de Mon Sep 17 00:00:00 2001
From: Stuart Axon <stuaxo2@yahoo.com>
Date: Thu, 12 Oct 2017 00:45:59 +0100
Subject: [PATCH 1/5] pelican-import, fix urls of attached files

This patch makes wp-attach modify ```<img``` and ```<a href``` URLs to point at the attachment path, so ```http://example.com/wp-content/uploads/blah.png``` becomes ```wp-content/uploads/blah.png```.

This needs wp-content to be listed in STATIC_PATHS in pelicanconf.

I thought I could use {filename}, but since the importer generates source HTML this doesn't work.
It would be nice if there was a way to customise the output path, but I'm not sure how that would work.
---
 pelican/tests/test_importer.py  |  6 +++++-
 pelican/tools/pelican_import.py | 30 +++++++++++++++++++++++-------
 2 files changed, 28 insertions(+), 8 deletions(-)
diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py
index 6af59212..63d17539 100644
--- a/pelican/tests/test_importer.py
+++ b/pelican/tests/test_importer.py
@@ -234,7 +234,11 @@ class TestWordpressXmlImporter(unittest.TestCase):
             with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file:
                 decoded_content = decoded_file.read()
                 self.assertEqual(
-                    decode_wp_content(encoded_content, br=False),
+                    decode_wp_content(
+                      encoded_content,
+                      attached_files=None,
+                      br=False
+                    ),
                     decoded_content)
 
     def test_preserve_verbatim_formatting(self):
diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py
index 25fc45e5..26df1a4d 100755
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@@ -3,6 +3,7 @@
 from __future__ import print_function, unicode_literals
 
 import argparse
+import collections
 import logging
 import os
 import re
@@ -29,11 +30,22 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
-def decode_wp_content(content, br=True):
+def decode_wp_content(content, attached_files=None, br=True):
     pre_tags = {}
     if content.strip() == "":
         return ""
 
+    if attached_files:
+        for path, urls in attached_files.items():
+            for url in urls:
+                content = re.sub(r'(<a\s+[^>]*href=")%s(")' % url,
+                                 r'\1/%s\2' % path,
+                                 content)
+
+                content = re.sub(r'(<img\s+[^>]*src=")%s(")' % url,
+                                 r'\1/%s\2' % path,
+                                 content)
+
     content += "\n"
     if "<pre" in content:
         pre_parts = content.split("</pre>")
@@ -651,10 +663,13 @@ def get_attachments(xml):
 
 
 def download_attachments(output_path, urls):
-    """Downloads WordPress attachments and returns a list of paths to
-    attachments that can be associated with a post (relative path to output
-    directory). Files that fail to download, will not be added to posts"""
-    locations = []
+    """Downloads WordPress attachments and returns a dict of
+        attachments that can be associated with a post (relative path to output
+        directory). Files that fail to download, will not be added to posts
+
+    {relpath: {set of urls}}
+    """
+    locations = collections.defaultdict(set)
     for url in urls:
         path = urlparse(url).path
         # teardown path and rebuild to negate any errors with
@@ -671,7 +686,8 @@ def download_attachments(output_path, urls):
         print('downloading {}'.format(filename))
         try:
             urlretrieve(url, os.path.join(full_path, filename))
-            locations.append(os.path.join(localpath, filename))
+            relpath = os.path.join(localpath, filename)
+            locations[relpath].add(url)
         except (URLError, IOError) as e:
             # Python 2.7 throws an IOError rather Than URLError
             logger.warning("No file could be downloaded from %s\n%s", url, e)
@@ -720,7 +736,7 @@ def fields2pelican(
                 # Replace newlines with paragraphs wrapped with <p> so
                 # HTML is valid before conversion
                 if in_markup == 'wp-html':
-                    new_content = decode_wp_content(content)
+                    new_content = decode_wp_content(content, attached_files)
                 else:
                     paragraphs = content.splitlines()
                     paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs]

From c776cb2234817ae76d659329a245f7b36a1146f4 Mon Sep 17 00:00:00 2001
From: Stuart Axon <stuaxo2@yahoo.com>
Date: Fri, 13 Oct 2017 01:05:22 +0100
Subject: [PATCH 2/5] Process URLs after WordPress XML has been converted to
 output format; don't download files more than once; handle post names that
 are just spaces

---
 pelican/tests/test_importer.py  |  6 +----
 pelican/tools/pelican_import.py | 42 ++++++++++++++++-----------------
 2 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py
index 63d17539..6af59212 100644
--- a/pelican/tests/test_importer.py
+++ b/pelican/tests/test_importer.py
@@ -234,11 +234,7 @@ class TestWordpressXmlImporter(unittest.TestCase):
             with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file:
                 decoded_content = decoded_file.read()
                 self.assertEqual(
-                    decode_wp_content(
-                      encoded_content,
-                      attached_files=None,
-                      br=False
-                    ),
+                    decode_wp_content(encoded_content, br=False),
                     decoded_content)
 
     def test_preserve_verbatim_formatting(self):
diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py
index 26df1a4d..fa5d85b9 100755
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@@ -30,22 +30,11 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
-def decode_wp_content(content, attached_files=None, br=True):
+def decode_wp_content(content, br=True):
     pre_tags = {}
     if content.strip() == "":
         return ""
 
-    if attached_files:
-        for path, urls in attached_files.items():
-            for url in urls:
-                content = re.sub(r'(<a\s+[^>]*href=")%s(")' % url,
-                                 r'\1/%s\2' % path,
-                                 content)
-
-                content = re.sub(r'(<img\s+[^>]*src=")%s(")' % url,
-                                 r'\1/%s\2' % path,
-                                 content)
-
     content += "\n"
     if "<pre" in content:
         pre_parts = content.split("</pre>")
@@ -144,9 +133,9 @@ def get_items(xml):
     return items
 
 
-def get_filename(filename, post_id):
-    if filename is not None:
-        return filename
+def get_filename(post_name, post_id):
+    if post_name and not post_name.isspace():
+        return post_name
     else:
         return post_id
 
@@ -166,9 +155,9 @@ def wp2fields(xml, wp_custpost=False):
                 title = 'No title [%s]' % item.find('post_name').string
                 logger.warning('Post "%s" is lacking a proper title', title)
 
-            filename = item.find('post_name').string
+            post_name = item.find('post_name').string
             post_id = item.find('post_id').string
-            filename = get_filename(filename, post_id)
+            filename = get_filename(post_name, post_id)
 
             content = item.find('encoded').string
             raw_date = item.find('post_date').string
@@ -637,14 +626,14 @@ def get_attachments(xml):
 
     for item in items:
         kind = item.find('post_type').string
-        filename = item.find('post_name').string
+        post_name = item.find('post_name').string
         post_id = item.find('post_id').string
 
         if kind == 'attachment':
             attachments.append((item.find('post_parent').string,
                                 item.find('attachment_url').string))
         else:
-            filename = get_filename(filename, post_id)
+            filename = get_filename(post_name, post_id)
             names[post_id] = filename
     attachedposts = {}
     for parent, url in attachments:
@@ -683,10 +672,13 @@ def download_attachments(output_path, urls):
         full_path = os.path.join(output_path, localpath)
         if not os.path.exists(full_path):
             os.makedirs(full_path)
+        relpath = os.path.join(localpath, filename)
+        if url in locations[relpath]:
+            continue
+
         print('downloading {}'.format(filename))
         try:
             urlretrieve(url, os.path.join(full_path, filename))
-            relpath = os.path.join(localpath, filename)
             locations[relpath].add(url)
         except (URLError, IOError) as e:
             # Python 2.7 throws an IOError rather Than URLError
@@ -736,7 +728,7 @@ def fields2pelican(
                 # Replace newlines with paragraphs wrapped with <p> so
                 # HTML is valid before conversion
                 if in_markup == 'wp-html':
-                    new_content = decode_wp_content(content, attached_files)
+                    new_content = decode_wp_content(content)
                 else:
                     paragraphs = content.splitlines()
                     paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs]
@@ -767,6 +759,14 @@ def fields2pelican(
 
             with open(out_filename, 'r', encoding='utf-8') as fs:
                 content = fs.read()
+
+                if attached_files:
+                    for path, urls in attached_files.items():
+                        for url in urls:
+                            content = re.sub(url,
+                                             r'{filename}/%s' % path,
+                                             content)
+
                 if out_markup == 'markdown':
                     # In markdown, to insert a <br />, end a line with two
                     # or more spaces & then a end-of-line

From 310ae7edeea19a051f112b85a781cb21f3c8627b Mon Sep 17 00:00:00 2001
From: Stuart Axon <stuaxo2@yahoo.com>
Date: Sat, 14 Oct 2017 17:35:51 +0100
Subject: [PATCH 3/5] Update pelican_import.py

Link to ```{filename}/path/to/file```
---
 pelican/tools/pelican_import.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py
index fa5d85b9..a8e323fb 100755
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@@ -764,7 +764,7 @@ def fields2pelican(
                     for path, urls in attached_files.items():
                         for url in urls:
                             content = re.sub(url,
-                                             r'{filename}/%s' % path,
+                                             r'/{filename}/%s' % path,
                                              content)
 
                 if out_markup == 'markdown':

From ea1104061a33b5c804b89958f5025ed4000d369f Mon Sep 17 00:00:00 2001
From: Stuart Axon <stuaxo2@yahoo.com>
Date: Sat, 14 Oct 2017 18:24:43 +0100
Subject: [PATCH 4/5] Update test_importer.py

Try fix for download_attachments test
---
 pelican/tests/test_importer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py
index 6af59212..f2c704c3 100644
--- a/pelican/tests/test_importer.py
+++ b/pelican/tests/test_importer.py
@@ -378,7 +378,7 @@ class TestWordpressXMLAttachements(unittest.TestCase):
         bad_url = 'http://localhost:1/not_a_file.txt'
         silent_da = mute()(download_attachments)
         with temporary_folder() as temp:
-            locations = list(silent_da(temp, [good_url, bad_url]))
+            locations = list(silent_da(temp, [good_url, bad_url]).keys())
             self.assertEqual(1, len(locations))
             directory = locations[0]
             self.assertTrue(

From 2ece08afafdcb3d246a8c1596f9aeafeed9fa7bf Mon Sep 17 00:00:00 2001
From: Stuart Axon <stuaxo2@yahoo.com>
Date: Sat, 14 Oct 2017 19:46:16 +0100
Subject: [PATCH 5/5] Update pelican_import.py

Don't create empty sets in download_attachments; this might fix an issue that came up in the tests.
---
 pelican/tools/pelican_import.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py
index a8e323fb..6fddb2be 100755
--- a/pelican/tools/pelican_import.py
+++ b/pelican/tools/pelican_import.py
@@ -673,7 +673,7 @@ def download_attachments(output_path, urls):
         if not os.path.exists(full_path):
             os.makedirs(full_path)
         relpath = os.path.join(localpath, filename)
-        if url in locations[relpath]:
+        if relpath in locations and url in locations[relpath]:
             continue
 
         print('downloading {}'.format(filename))