Process URLs after wordpress XML has been converted to output format

Don't download files more than once
Handle post names that are just spaces
This commit is contained in:
Stuart Axon 2017-10-13 01:05:22 +01:00
commit c776cb2234
2 changed files with 22 additions and 26 deletions

View file

@ -234,11 +234,7 @@ class TestWordpressXmlImporter(unittest.TestCase):
with open(WORDPRESS_DECODED_CONTENT_SAMPLE, 'r') as decoded_file:
decoded_content = decoded_file.read()
self.assertEqual(
decode_wp_content(
encoded_content,
attached_files=None,
br=False
),
decode_wp_content(encoded_content, br=False),
decoded_content)
def test_preserve_verbatim_formatting(self):

View file

@ -30,22 +30,11 @@ except ImportError:
logger = logging.getLogger(__name__)
def decode_wp_content(content, attached_files=None, br=True):
def decode_wp_content(content, br=True):
pre_tags = {}
if content.strip() == "":
return ""
if attached_files:
for path, urls in attached_files.items():
for url in urls:
content = re.sub(r'(<a\s+[^>]*href=")%s(")' % url,
r'\1/%s\2' % path,
content)
content = re.sub(r'(<img\s+[^>]*src=")%s(")' % url,
r'\1/%s\2' % path,
content)
content += "\n"
if "<pre" in content:
pre_parts = content.split("</pre>")
@ -144,9 +133,9 @@ def get_items(xml):
return items
def get_filename(filename, post_id):
if filename is not None:
return filename
def get_filename(post_name, post_id):
if post_name and not post_name.isspace():
return post_name
else:
return post_id
@ -166,9 +155,9 @@ def wp2fields(xml, wp_custpost=False):
title = 'No title [%s]' % item.find('post_name').string
logger.warning('Post "%s" is lacking a proper title', title)
filename = item.find('post_name').string
post_name = item.find('post_name').string
post_id = item.find('post_id').string
filename = get_filename(filename, post_id)
filename = get_filename(post_name, post_id)
content = item.find('encoded').string
raw_date = item.find('post_date').string
@ -637,14 +626,14 @@ def get_attachments(xml):
for item in items:
kind = item.find('post_type').string
filename = item.find('post_name').string
post_name = item.find('post_name').string
post_id = item.find('post_id').string
if kind == 'attachment':
attachments.append((item.find('post_parent').string,
item.find('attachment_url').string))
else:
filename = get_filename(filename, post_id)
filename = get_filename(post_name, post_id)
names[post_id] = filename
attachedposts = {}
for parent, url in attachments:
@ -683,10 +672,13 @@ def download_attachments(output_path, urls):
full_path = os.path.join(output_path, localpath)
if not os.path.exists(full_path):
os.makedirs(full_path)
relpath = os.path.join(localpath, filename)
if url in locations[relpath]:
continue
print('downloading {}'.format(filename))
try:
urlretrieve(url, os.path.join(full_path, filename))
relpath = os.path.join(localpath, filename)
locations[relpath].add(url)
except (URLError, IOError) as e:
# Python 2.7 throws an IOError rather Than URLError
@ -736,7 +728,7 @@ def fields2pelican(
# Replace newlines with paragraphs wrapped with <p> so
# HTML is valid before conversion
if in_markup == 'wp-html':
new_content = decode_wp_content(content, attached_files)
new_content = decode_wp_content(content)
else:
paragraphs = content.splitlines()
paragraphs = ['<p>{0}</p>'.format(p) for p in paragraphs]
@ -767,6 +759,14 @@ def fields2pelican(
with open(out_filename, 'r', encoding='utf-8') as fs:
content = fs.read()
if attached_files:
for path, urls in attached_files.items():
for url in urls:
content = re.sub(url,
r'{filename}/%s' % path,
content)
if out_markup == 'markdown':
# In markdown, to insert a <br />, end a line with two
# or more spaces & then a end-of-line