From 8c7ea8df981a7afbffdd2b5df136c7ccb7dbe3b3 Mon Sep 17 00:00:00 2001 From: James Murty Date: Fri, 19 Apr 2013 23:06:59 +0100 Subject: [PATCH] Import wordpress pages to pages/ subdir with --dir-page option When importing from Wordpress, the --dir-page directive (disabled by default) automatically adds files to the pages/ subdirectory when they are recognised as pages, as opposed to posts. --- docs/importer.rst | 4 +++- pelican/tests/test_importer.py | 21 +++++++++++++++++- pelican/tools/pelican_import.py | 38 +++++++++++++++++++++++++++------ 3 files changed, 54 insertions(+), 9 deletions(-) diff --git a/docs/importer.rst b/docs/importer.rst index bb24b4b9..86b67767 100644 --- a/docs/importer.rst +++ b/docs/importer.rst @@ -42,7 +42,7 @@ Usage :: pelican-import [-h] [--wpfile] [--dotclear] [--posterous] [--feed] [-o OUTPUT] - [-m MARKUP] [--dir-cat] [--strip-raw] [--disable-slugs] + [-m MARKUP] [--dir-cat] [--dir-page] [--strip-raw] [--disable-slugs] [-e EMAIL] [-p PASSWORD] input|api_token @@ -67,6 +67,8 @@ Optional arguments (default: rst) --dir-cat Put files in directories with categories name (default: False) + --dir-page Put files recognised as pages in "pages/" sub- + directory (wordpress import only) (default: False) --strip-raw Strip raw HTML code that can't be converted to markup such as flash embeds or iframes (wordpress import only) (default: False) diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index c18f447f..cb095426 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -32,9 +32,28 @@ class TestWordpressXmlImporter(unittest.TestCase): def test_ignore_empty_posts(self): self.assertTrue(self.posts) - for title, content, fname, date, author, categ, tags, format in self.posts: + for title, content, fname, date, author, categ, tags, kind, format in self.posts: self.assertTrue(title.strip()) + def test_recognise_page_kind(self): + """ Check that we recognise pages in wordpress, as opposed to posts """ + 
self.assertTrue(self.posts) + # Collect (title, filename, kind) of non-empty posts recognised as page + pages_data = [] + for title, content, fname, date, author, categ, tags, kind, format in self.posts: + if kind == 'page': + pages_data.append((title, fname)) + self.assertEqual(2, len(pages_data)) + self.assertEqual(('Page', 'contact'), pages_data[0]) + self.assertEqual(('Empty Page', 'empty'), pages_data[1]) + + def test_dirpage_directive_for_page_kind(self): + silent_f2p = mute(True)(fields2pelican) + test_post = filter(lambda p: p[0].startswith("Empty Page"), self.posts) + with temporary_folder() as temp: + fname = list(silent_f2p(test_post, 'markdown', temp, dirpage=True))[0] + self.assertTrue(fname.endswith('pages%sempty.md' % os.path.sep)) + def test_can_toggle_raw_html_code_parsing(self): def r(f): with open(f) as infile: diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 9e477c2c..5d0937fd 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -136,7 +136,12 @@ def wp2fields(xml): tags = [tag.string for tag in item.findAll('category', {'domain' : 'post_tag'})] - yield (title, content, filename, date, author, categories, tags, "wp-html") + kind = 'article' + if item.find('post_type').string == 'page': + kind = 'page' + + yield (title, content, filename, date, author, categories, tags, + kind, "wp-html") def dc2fields(file): """Opens a Dotclear export file, and yield pelican fields""" @@ -265,7 +270,10 @@ def dc2fields(file): content = content.replace('\\n', '') post_format = "html" - yield (post_title, content, slugify(post_title), post_creadt, author, categories, tags, post_format) + kind = 'article' # TODO: Recognise pages + + yield (post_title, content, slugify(post_title), post_creadt, author, + categories, tags, kind, post_format) def posterous2fields(api_token, email, password): @@ -313,9 +321,10 @@ def posterous2fields(api_token, email, password): delta = timedelta(hours = offset / 100) 
date_object -= delta date = date_object.strftime("%Y-%m-%d %H:%M") + kind = 'article' # TODO: Recognise pages yield (post.get('title'), post.get('body_cleaned'), slug, date, - post.get('user').get('display_name'), [], tags, "html") + post.get('user').get('display_name'), [], tags, kind, "html") def feed2fields(file): """Read a feed and yield pelican fields""" @@ -328,7 +337,9 @@ def feed2fields(file): tags = [e['term'] for e in entry.tags] if hasattr(entry, "tags") else None slug = slugify(entry.title) - yield (entry.title, entry.description, slug, date, author, [], tags, "html") + kind = 'article' + yield (entry.title, entry.description, slug, date, author, [], tags, + kind, "html") def build_header(title, date, author, categories, tags, slug): @@ -363,8 +374,11 @@ def build_markdown_header(title, date, author, categories, tags, slug): header += '\n' return header -def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=False, disable_slugs=False): - for title, content, filename, date, author, categories, tags, in_markup in fields: +def fields2pelican(fields, out_markup, output_path, + dircat=False, strip_raw=False, disable_slugs=False, + dirpage=False, filename_template=None): + for (title, content, filename, date, author, categories, tags, + kind, in_markup) in fields: slug = not disable_slugs and filename or None if (in_markup == "markdown") or (out_markup == "markdown") : ext = '.md' @@ -385,8 +399,14 @@ def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=Fals filename = '_' filename = filename[:249] # allow for 5 extra characters + # option to put page posts in pages/ subdirectory + if dirpage and kind == 'page': + pages_dir = os.path.join(output_path, 'pages') + if not os.path.isdir(pages_dir): + os.mkdir(pages_dir) + out_filename = os.path.join(pages_dir, filename+ext) # option to put files in directories with categories names - if dircat and (len(categories) > 0): + elif dircat and (len(categories) > 0): catname 
= slugify(categories[0]) out_filename = os.path.join(output_path, catname, filename+ext) if not os.path.isdir(os.path.join(output_path, catname)): @@ -464,6 +484,9 @@ def main(): help='Output markup format (supports rst & markdown)') parser.add_argument('--dir-cat', action='store_true', dest='dircat', help='Put files in directories with categories name') + parser.add_argument('--dir-page', action='store_true', dest='dirpage', + help=('Put files recognised as pages in "pages/" sub-directory' + ' (wordpress import only)')) parser.add_argument('--strip-raw', action='store_true', dest='strip_raw', help="Strip raw HTML code that can't be converted to " "markup such as flash embeds or iframes (wordpress import only)") @@ -512,5 +535,6 @@ def main(): fields2pelican(fields, args.markup, args.output, dircat=args.dircat or False, + dirpage=args.dirpage or False, strip_raw=args.strip_raw or False, disable_slugs=args.disable_slugs or False)