importer: update links to attachments if --wp-attach

This commit is contained in:
Oliver Urs Lenz 2018-07-09 11:26:50 +02:00
commit a5571ba1d5
2 changed files with 27 additions and 15 deletions

View file

@ -80,10 +80,11 @@ Optional arguments
as "/post_type/category/" (wordpress import only) as "/post_type/category/" (wordpress import only)
--wp-attach Download files uploaded to wordpress as attachments. --wp-attach Download files uploaded to wordpress as attachments.
Files will be added to posts as a list in the post Files will be added to posts as a list in the post
header. All files will be downloaded, even if they header and links to the files within the post will be
updated. All files will be downloaded, even if they
aren't associated with a post. Files will be downloaded aren't associated with a post. Files will be downloaded
with their original path inside the output directory, with their original path inside the output directory,
e.g. "output/wp-uploads/date/postname/file.jpg" e.g. "output/wp-uploads/date/postname/file.jpg".
(wordpress import only) (requires an internet (wordpress import only) (requires an internet
connection) connection)
--disable-slugs Disable storing slugs from imported posts within --disable-slugs Disable storing slugs from imported posts within

View file

@ -10,6 +10,7 @@ import subprocess
import sys import sys
import time import time
from codecs import open from codecs import open
from collections import defaultdict
from six.moves.urllib.error import URLError from six.moves.urllib.error import URLError
from six.moves.urllib.parse import urlparse from six.moves.urllib.parse import urlparse
@ -633,7 +634,7 @@ def get_attachments(xml):
else: else:
filename = get_filename(filename, post_id) filename = get_filename(filename, post_id)
names[post_id] = filename names[post_id] = filename
attachedposts = {} attachedposts = defaultdict(list)
for parent, url in attachments: for parent, url in attachments:
try: try:
parent_name = names[parent] parent_name = names[parent]
@ -641,11 +642,7 @@ def get_attachments(xml):
# attachment's parent is not a valid post # attachment's parent is not a valid post
parent_name = None parent_name = None
try: attachedposts[parent_name].append(url)
attachedposts[parent_name].append(url)
except KeyError:
attachedposts[parent_name] = []
attachedposts[parent_name].append(url)
return attachedposts return attachedposts
@ -653,7 +650,7 @@ def download_attachments(output_path, urls):
"""Downloads WordPress attachments and returns a list of paths to """Downloads WordPress attachments and returns a list of paths to
attachments that can be associated with a post (relative path to output attachments that can be associated with a post (relative path to output
directory). Files that fail to download, will not be added to posts""" directory). Files that fail to download, will not be added to posts"""
locations = [] locations = {}
for url in urls: for url in urls:
path = urlparse(url).path path = urlparse(url).path
# teardown path and rebuild to negate any errors with # teardown path and rebuild to negate any errors with
@ -670,13 +667,23 @@ def download_attachments(output_path, urls):
print('downloading {}'.format(filename)) print('downloading {}'.format(filename))
try: try:
urlretrieve(url, os.path.join(full_path, filename)) urlretrieve(url, os.path.join(full_path, filename))
locations.append(os.path.join(localpath, filename)) locations[url] = os.path.join(localpath, filename)
except (URLError, IOError) as e: except (URLError, IOError) as e:
# Python 2.7 throws an IOError rather Than URLError # Python 2.7 throws an IOError rather Than URLError
logger.warning("No file could be downloaded from %s\n%s", url, e) logger.warning("No file could be downloaded from %s\n%s", url, e)
return locations return locations
def update_links_to_attached_files(content, attachments):
    """Point links in ``content`` at the replacement paths in ``attachments``.

    ``attachments`` maps an original URL to the path that should replace it.
    Every occurrence of each URL in ``content`` is replaced by the literal
    prefix ``{filename}`` followed by the mapped path. The updated content
    is returned; the arguments themselves are not modified.
    """
    for remote_url, local_path in attachments.items():
        replacement = '{filename}' + local_path
        # The same file may be referenced with either scheme in the post
        # body, so substitute the http:// spelling first, then https://.
        scheme_variants = (
            remote_url.replace('https://', 'http://'),
            remote_url.replace('http://', 'https://'),
        )
        for variant in scheme_variants:
            content = content.replace(variant, replacement)
    return content
def fields2pelican( def fields2pelican(
fields, out_markup, output_path, fields, out_markup, output_path,
dircat=False, strip_raw=False, disable_slugs=False, dircat=False, strip_raw=False, disable_slugs=False,
@ -691,21 +698,22 @@ def fields2pelican(
if wp_attach and attachments: if wp_attach and attachments:
try: try:
urls = attachments[filename] urls = attachments[filename]
attached_files = download_attachments(output_path, urls) links = download_attachments(output_path, urls)
except KeyError: except KeyError:
attached_files = None links = None
else: else:
attached_files = None links = None
ext = get_ext(out_markup, in_markup) ext = get_ext(out_markup, in_markup)
if ext == '.md': if ext == '.md':
header = build_markdown_header( header = build_markdown_header(
title, date, author, categories, tags, slug, title, date, author, categories, tags, slug,
status, attached_files) status, links.values() if links else None)
else: else:
out_markup = 'rst' out_markup = 'rst'
header = build_header(title, date, author, categories, header = build_header(title, date, author, categories,
tags, slug, status, attached_files) tags, slug, status, links.values()
if links else None)
out_filename = get_out_filename( out_filename = get_out_filename(
output_path, filename, ext, kind, dirpage, dircat, output_path, filename, ext, kind, dirpage, dircat,
@ -756,6 +764,9 @@ def fields2pelican(
content = content.replace('\\\n ', ' \n') content = content.replace('\\\n ', ' \n')
content = content.replace('\\\n', ' \n') content = content.replace('\\\n', ' \n')
if wp_attach and links:
content = update_links_to_attached_files(content, links)
with open(out_filename, 'w', encoding='utf-8') as fs: with open(out_filename, 'w', encoding='utf-8') as fs:
fs.write(header + content) fs.write(header + content)
if wp_attach and attachments and None in attachments: if wp_attach and attachments and None in attachments: