Import wordpress pages to pages/ subdir with --dir-page option

When importing from Wordpress, the --dir-page directive (disabled by
default) automatically adds files to the pages/ when they are recognised
as pages, as opposed to posts.
This commit is contained in:
James Murty 2013-04-19 23:06:59 +01:00
commit 8c7ea8df98
3 changed files with 54 additions and 9 deletions

View file

@ -42,7 +42,7 @@ Usage
::
pelican-import [-h] [--wpfile] [--dotclear] [--posterous] [--feed] [-o OUTPUT]
[-m MARKUP] [--dir-cat] [--strip-raw] [--disable-slugs]
[-m MARKUP] [--dir-cat] [--dir-page] [--strip-raw] [--disable-slugs]
[-e EMAIL] [-p PASSWORD]
input|api_token
@ -67,6 +67,8 @@ Optional arguments
(default: rst)
--dir-cat Put files in directories with categories name
(default: False)
--dir-page Put files recognised as pages in "pages/" sub-
directory (wordpress import only) (default: False)
--strip-raw Strip raw HTML code that can't be converted to markup
such as flash embeds or iframes (wordpress import
only) (default: False)

View file

@ -32,9 +32,28 @@ class TestWordpressXmlImporter(unittest.TestCase):
def test_ignore_empty_posts(self):
self.assertTrue(self.posts)
for title, content, fname, date, author, categ, tags, format in self.posts:
for title, content, fname, date, author, categ, tags, kind, format in self.posts:
self.assertTrue(title.strip())
def test_recognise_page_kind(self):
""" Check that we recognise pages in wordpress, as opposed to posts """
self.assertTrue(self.posts)
# Collect (title, filename, kind) of non-empty posts recognised as page
pages_data = []
for title, content, fname, date, author, categ, tags, kind, format in self.posts:
if kind == 'page':
pages_data.append((title, fname))
self.assertEqual(2, len(pages_data))
self.assertEqual(('Page', 'contact'), pages_data[0])
self.assertEqual(('Empty Page', 'empty'), pages_data[1])
def test_dirpage_directive_for_page_kind(self):
silent_f2p = mute(True)(fields2pelican)
test_post = filter(lambda p: p[0].startswith("Empty Page"), self.posts)
with temporary_folder() as temp:
fname = list(silent_f2p(test_post, 'markdown', temp, dirpage=True))[0]
self.assertTrue(fname.endswith('pages%sempty.md' % os.path.sep))
def test_can_toggle_raw_html_code_parsing(self):
def r(f):
with open(f) as infile:

View file

@ -136,7 +136,12 @@ def wp2fields(xml):
tags = [tag.string for tag in item.findAll('category', {'domain' : 'post_tag'})]
yield (title, content, filename, date, author, categories, tags, "wp-html")
kind = 'article'
if item.find('post_type').string == 'page':
kind = 'page'
yield (title, content, filename, date, author, categories, tags,
kind, "wp-html")
def dc2fields(file):
"""Opens a Dotclear export file, and yield pelican fields"""
@ -265,7 +270,10 @@ def dc2fields(file):
content = content.replace('\\n', '')
post_format = "html"
yield (post_title, content, slugify(post_title), post_creadt, author, categories, tags, post_format)
kind = 'article' # TODO: Recognise pages
yield (post_title, content, slugify(post_title), post_creadt, author,
categories, tags, kind, post_format)
def posterous2fields(api_token, email, password):
@ -313,9 +321,10 @@ def posterous2fields(api_token, email, password):
delta = timedelta(hours = offset / 100)
date_object -= delta
date = date_object.strftime("%Y-%m-%d %H:%M")
kind = 'article' # TODO: Recognise pages
yield (post.get('title'), post.get('body_cleaned'), slug, date,
post.get('user').get('display_name'), [], tags, "html")
post.get('user').get('display_name'), [], tags, kind, "html")
def feed2fields(file):
"""Read a feed and yield pelican fields"""
@ -328,7 +337,9 @@ def feed2fields(file):
tags = [e['term'] for e in entry.tags] if hasattr(entry, "tags") else None
slug = slugify(entry.title)
yield (entry.title, entry.description, slug, date, author, [], tags, "html")
kind = 'article'
yield (entry.title, entry.description, slug, date, author, [], tags,
kind, "html")
def build_header(title, date, author, categories, tags, slug):
@ -363,8 +374,11 @@ def build_markdown_header(title, date, author, categories, tags, slug):
header += '\n'
return header
def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=False, disable_slugs=False):
for title, content, filename, date, author, categories, tags, in_markup in fields:
def fields2pelican(fields, out_markup, output_path,
dircat=False, strip_raw=False, disable_slugs=False,
dirpage=False, filename_template=None):
for (title, content, filename, date, author, categories, tags,
kind, in_markup) in fields:
slug = not disable_slugs and filename or None
if (in_markup == "markdown") or (out_markup == "markdown") :
ext = '.md'
@ -385,8 +399,14 @@ def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=Fals
filename = '_'
filename = filename[:249] # allow for 5 extra characters
# option to put page posts in pages/ subdirectory
if dirpage and kind == 'page':
pages_dir = os.path.join(output_path, 'pages')
if not os.path.isdir(pages_dir):
os.mkdir(pages_dir)
out_filename = os.path.join(pages_dir, filename+ext)
# option to put files in directories with categories names
if dircat and (len(categories) > 0):
elif dircat and (len(categories) > 0):
catname = slugify(categories[0])
out_filename = os.path.join(output_path, catname, filename+ext)
if not os.path.isdir(os.path.join(output_path, catname)):
@ -464,6 +484,9 @@ def main():
help='Output markup format (supports rst & markdown)')
parser.add_argument('--dir-cat', action='store_true', dest='dircat',
help='Put files in directories with categories name')
parser.add_argument('--dir-page', action='store_true', dest='dirpage',
help=('Put files recognised as pages in "pages/" sub-directory'
' (wordpress import only)'))
parser.add_argument('--strip-raw', action='store_true', dest='strip_raw',
help="Strip raw HTML code that can't be converted to "
"markup such as flash embeds or iframes (wordpress import only)")
@ -512,5 +535,6 @@ def main():
fields2pelican(fields, args.markup, args.output,
dircat=args.dircat or False,
dirpage=args.dirpage or False,
strip_raw=args.strip_raw or False,
disable_slugs=args.disable_slugs or False)