pelican-import: fix URLs of attached files

With this patch, wp-attach rewrites ```<img src``` and ```<a href``` URLs to point at the attachment path, so ```http://example.com/wp-content/uploads/blah.png``` becomes ```wp-content/uploads/blah.png```.

This requires wp-content to be listed in STATIC_PATHS in pelicanconf.

I thought I could use {filename}, but since the importer generates source HTML, this doesn't work.
It would be nice if there were a way to customise the output path, but I'm not sure how that would work.
This commit is contained in:
Stuart Axon 2017-10-12 00:45:59 +01:00
commit fe2eb86faf
2 changed files with 28 additions and 8 deletions

View file

@ -234,7 +234,11 @@ class TestWordpressXmlImporter(unittest.TestCase):
with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file: with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file:
decoded_content = decoded_file.read() decoded_content = decoded_file.read()
self.assertEqual( self.assertEqual(
decode_wp_content(encoded_content, br=False), decode_wp_content(
encoded_content,
attached_files=None,
br=False
),
decoded_content) decoded_content)
def test_preserve_verbatim_formatting(self): def test_preserve_verbatim_formatting(self):

View file

@ -3,6 +3,7 @@
from __future__ import print_function, unicode_literals from __future__ import print_function, unicode_literals
import argparse import argparse
import collections
import logging import logging
import os import os
import re import re
@ -29,11 +30,22 @@ except ImportError:
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def decode_wp_content(content, br=True): def decode_wp_content(content, attached_files=None, br=True):
pre_tags = {} pre_tags = {}
if content.strip() == "": if content.strip() == "":
return "" return ""
if attached_files:
for path, urls in attached_files.items():
for url in urls:
content = re.sub(r'(<a\s+[^>]*href=")%s(")' % url,
r'\1/%s\2' % path,
content)
content = re.sub(r'(<img\s+[^>]*src=")%s(")' % url,
r'\1/%s\2' % path,
content)
content += "\n" content += "\n"
if "<pre" in content: if "<pre" in content:
pre_parts = content.split("</pre>") pre_parts = content.split("</pre>")
@ -651,10 +663,13 @@ def get_attachments(xml):
def download_attachments(output_path, urls): def download_attachments(output_path, urls):
"""Downloads WordPress attachments and returns a list of paths to """Downloads WordPress attachments and returns a returns a dict {url:path} of
attachments that can be associated with a post (relative path to output attachments that can be associated with a post (relative path to output
directory). Files that fail to download, will not be added to posts""" directory). Files that fail to download, will not be added to posts
locations = []
{relpath: {set of urls}}
"""
locations = collections.defaultdict(set)
for url in urls: for url in urls:
path = urlparse(url).path path = urlparse(url).path
# teardown path and rebuild to negate any errors with # teardown path and rebuild to negate any errors with
@ -671,7 +686,8 @@ def download_attachments(output_path, urls):
print('downloading {}'.format(filename)) print('downloading {}'.format(filename))
try: try:
urlretrieve(url, os.path.join(full_path, filename)) urlretrieve(url, os.path.join(full_path, filename))
locations.append(os.path.join(localpath, filename)) relpath = os.path.join(localpath, filename)
locations[relpath].add(url)
except (URLError, IOError) as e: except (URLError, IOError) as e:
# Python 2.7 throws an IOError rather Than URLError # Python 2.7 throws an IOError rather Than URLError
logger.warning("No file could be downloaded from %s\n%s", url, e) logger.warning("No file could be downloaded from %s\n%s", url, e)
@ -720,7 +736,7 @@ def fields2pelican(
# Replace newlines with paragraphs wrapped with <p> so # Replace newlines with paragraphs wrapped with <p> so
# HTML is valid before conversion # HTML is valid before conversion
if in_markup == 'wp-html': if in_markup == 'wp-html':
new_content = decode_wp_content(content) new_content = decode_wp_content(content, attached_files)
else: else:
paragraphs = content.splitlines() paragraphs = content.splitlines()
paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs] paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs]