mirror of
https://github.com/getpelican/pelican.git
synced 2025-10-15 20:28:56 +02:00
pelican-import, fix urls of attached files
This patch wp-attach modifies ```<img``` and ```<a href``` URLs to point at the attachment path, so ```http://example.com/wp-content/uploads/blah.png``` becomes ```wp-content/uploads/blah.png```. This needs wp-content in pelicanconf STATIC_FILES. I thought I could use {filename}, but since the importer generates source HTML, this doesn't work. It would be nice if there was a way to customise the output path, but I'm not sure how that would work.
This commit is contained in:
parent
359ffcabb8
commit
fe2eb86faf
2 changed files with 28 additions and 8 deletions
|
|
@ -234,7 +234,11 @@ class TestWordpressXmlImporter(unittest.TestCase):
|
||||||
with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file:
|
with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file:
|
||||||
decoded_content = decoded_file.read()
|
decoded_content = decoded_file.read()
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
decode_wp_content(encoded_content, br=False),
|
decode_wp_content(
|
||||||
|
encoded_content,
|
||||||
|
attached_files=None,
|
||||||
|
br=False
|
||||||
|
),
|
||||||
decoded_content)
|
decoded_content)
|
||||||
|
|
||||||
def test_preserve_verbatim_formatting(self):
|
def test_preserve_verbatim_formatting(self):
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
from __future__ import print_function, unicode_literals
|
from __future__ import print_function, unicode_literals
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import collections
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
@ -29,11 +30,22 @@ except ImportError:
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def decode_wp_content(content, br=True):
|
def decode_wp_content(content, attached_files=None, br=True):
|
||||||
pre_tags = {}
|
pre_tags = {}
|
||||||
if content.strip() == "":
|
if content.strip() == "":
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
if attached_files:
|
||||||
|
for path, urls in attached_files.items():
|
||||||
|
for url in urls:
|
||||||
|
content = re.sub(r'(<a\s+[^>]*href=")%s(")' % url,
|
||||||
|
r'\1/%s\2' % path,
|
||||||
|
content)
|
||||||
|
|
||||||
|
content = re.sub(r'(<img\s+[^>]*src=")%s(")' % url,
|
||||||
|
r'\1/%s\2' % path,
|
||||||
|
content)
|
||||||
|
|
||||||
content += "\n"
|
content += "\n"
|
||||||
if "<pre" in content:
|
if "<pre" in content:
|
||||||
pre_parts = content.split("</pre>")
|
pre_parts = content.split("</pre>")
|
||||||
|
|
@ -651,10 +663,13 @@ def get_attachments(xml):
|
||||||
|
|
||||||
|
|
||||||
def download_attachments(output_path, urls):
|
def download_attachments(output_path, urls):
|
||||||
"""Downloads WordPress attachments and returns a list of paths to
|
"""Downloads WordPress attachments and returns a dict of
|
||||||
attachments that can be associated with a post (relative path to output
|
attachments that can be associated with a post (relative path to output
|
||||||
directory). Files that fail to download, will not be added to posts"""
|
directory). Files that fail to download, will not be added to posts
|
||||||
locations = []
|
|
||||||
|
{relpath: {set of urls}}
|
||||||
|
"""
|
||||||
|
locations = collections.defaultdict(set)
|
||||||
for url in urls:
|
for url in urls:
|
||||||
path = urlparse(url).path
|
path = urlparse(url).path
|
||||||
# teardown path and rebuild to negate any errors with
|
# teardown path and rebuild to negate any errors with
|
||||||
|
|
@ -671,7 +686,8 @@ def download_attachments(output_path, urls):
|
||||||
print('downloading {}'.format(filename))
|
print('downloading {}'.format(filename))
|
||||||
try:
|
try:
|
||||||
urlretrieve(url, os.path.join(full_path, filename))
|
urlretrieve(url, os.path.join(full_path, filename))
|
||||||
locations.append(os.path.join(localpath, filename))
|
relpath = os.path.join(localpath, filename)
|
||||||
|
locations[relpath].add(url)
|
||||||
except (URLError, IOError) as e:
|
except (URLError, IOError) as e:
|
||||||
# Python 2.7 throws an IOError rather Than URLError
|
# Python 2.7 throws an IOError rather Than URLError
|
||||||
logger.warning("No file could be downloaded from %s\n%s", url, e)
|
logger.warning("No file could be downloaded from %s\n%s", url, e)
|
||||||
|
|
@ -720,7 +736,7 @@ def fields2pelican(
|
||||||
# Replace newlines with paragraphs wrapped with <p> so
|
# Replace newlines with paragraphs wrapped with <p> so
|
||||||
# HTML is valid before conversion
|
# HTML is valid before conversion
|
||||||
if in_markup == 'wp-html':
|
if in_markup == 'wp-html':
|
||||||
new_content = decode_wp_content(content)
|
new_content = decode_wp_content(content, attached_files)
|
||||||
else:
|
else:
|
||||||
paragraphs = content.splitlines()
|
paragraphs = content.splitlines()
|
||||||
paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs]
|
paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue