Merge pull request #577 from davidjb/import-improvements-slug

Provide slug storage option for posts during Pelican import
This commit is contained in:
Alexis Metaireau 2012-12-11 03:52:12 -08:00
commit 98c8db568b
4 changed files with 64 additions and 16 deletions

View file

@ -9,6 +9,8 @@ Release history
3.1 (2012-12-04) 3.1 (2012-12-04)
================ ================
* Importer now stores slugs within files by default. This can be disabled with
the ``--disable-slugs`` option.
* Improve handling of links to intra-site resources * Improve handling of links to intra-site resources
* Ensure WordPress import adds paragraphs for all types of line endings * Ensure WordPress import adds paragraphs for all types of line endings
in post content in post content

View file

@ -39,29 +39,44 @@ Usage
""""" """""
| pelican-import [-h] [--wpfile] [--dotclear] [--feed] [-o OUTPUT] | pelican-import [-h] [--wpfile] [--dotclear] [--feed] [-o OUTPUT]
| [-m MARKUP][--dir-cat] | [-m MARKUP] [--dir-cat] [--strip-raw] [--disable-slugs]
| input | input
Positional arguments
====================
input The input file to read
Optional arguments Optional arguments
"""""""""""""""""" """"""""""""""""""
-h, --help show this help message and exit -h, --help show this help message and exit
--wpfile Wordpress XML export --wpfile Wordpress XML export (default: False)
--dotclear Dotclear export --dotclear Dotclear export (default: False)
--feed Feed to parse --feed Feed to parse (default: False)
-o OUTPUT, --output OUTPUT -o OUTPUT, --output OUTPUT
Output path Output path (default: output)
-m MARKUP Output markup -m MARKUP, --markup MARKUP
Output markup format (supports rst & markdown)
(default: rst)
--dir-cat Put files in directories with categories name --dir-cat Put files in directories with categories name
(default: False)
--strip-raw Strip raw HTML code that can't be converted to markup,
such as Flash embeds or iframes (WordPress import
only) (default: False)
--disable-slugs Disable storing slugs from imported posts within
output. With slug storage disabled, your Pelican URLs may not
be consistent with your original posts. (default:
False)
Examples Examples
======== ========
for WordPress:: For WordPress::
$ pelican-import --wpfile -o ~/output ~/posts.xml $ pelican-import --wpfile -o ~/output ~/posts.xml
for Dotclear:: For Dotclear::
$ pelican-import --dotclear -o ~/output ~/backup.txt $ pelican-import --dotclear -o ~/output ~/backup.txt

View file

@ -181,7 +181,7 @@ def feed2fields(file):
yield (entry.title, entry.description, slug, date, author, [], tags, "html") yield (entry.title, entry.description, slug, date, author, [], tags, "html")
def build_header(title, date, author, categories, tags): def build_header(title, date, author, categories, tags, slug):
"""Build a header from a list of fields""" """Build a header from a list of fields"""
header = '%s\n%s\n' % (title, '#' * len(title)) header = '%s\n%s\n' % (title, '#' * len(title))
if date: if date:
@ -192,10 +192,12 @@ def build_header(title, date, author, categories, tags):
header += ':category: %s\n' % ', '.join(categories) header += ':category: %s\n' % ', '.join(categories)
if tags: if tags:
header += ':tags: %s\n' % ', '.join(tags) header += ':tags: %s\n' % ', '.join(tags)
if slug:
header += ':slug: %s\n' % slug
header += '\n' header += '\n'
return header return header
def build_markdown_header(title, date, author, categories, tags): def build_markdown_header(title, date, author, categories, tags, slug):
"""Build a header from a list of fields""" """Build a header from a list of fields"""
header = 'Title: %s\n' % title header = 'Title: %s\n' % title
if date: if date:
@ -206,18 +208,21 @@ def build_markdown_header(title, date, author, categories, tags):
header += 'Category: %s\n' % ', '.join(categories) header += 'Category: %s\n' % ', '.join(categories)
if tags: if tags:
header += 'Tags: %s\n' % ', '.join(tags) header += 'Tags: %s\n' % ', '.join(tags)
if slug:
header += 'Slug: %s\n' % slug
header += '\n' header += '\n'
return header return header
def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=False): def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=False, disable_slugs=False):
for title, content, filename, date, author, categories, tags, in_markup in fields: for title, content, filename, date, author, categories, tags, in_markup in fields:
slug = not disable_slugs and filename or None
if (in_markup == "markdown") or (out_markup == "markdown") : if (in_markup == "markdown") or (out_markup == "markdown") :
ext = '.md' ext = '.md'
header = build_markdown_header(title, date, author, categories, tags) header = build_markdown_header(title, date, author, categories, tags, slug)
else: else:
out_markup = "rst" out_markup = "rst"
ext = '.rst' ext = '.rst'
header = build_header(title, date, author, categories, tags) header = build_header(title, date, author, categories, tags, slug)
filename = os.path.basename(filename) filename = os.path.basename(filename)
@ -278,8 +283,8 @@ def fields2pelican(fields, out_markup, output_path, dircat=False, strip_raw=Fals
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Transform feed, Wordpress or Dotclear files to rst files." description="Transform feed, Wordpress or Dotclear files to reST (rst) "
"Be sure to have pandoc installed", "or Markdown (md) files. Be sure to have pandoc installed.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(dest='input', help='The input file to read') parser.add_argument(dest='input', help='The input file to read')
@ -298,6 +303,11 @@ def main():
parser.add_argument('--strip-raw', action='store_true', dest='strip_raw', parser.add_argument('--strip-raw', action='store_true', dest='strip_raw',
help="Strip raw HTML code that can't be converted to " help="Strip raw HTML code that can't be converted to "
"markup such as flash embeds or iframes (wordpress import only)") "markup such as flash embeds or iframes (wordpress import only)")
parser.add_argument('--disable-slugs', action='store_true',
dest='disable_slugs',
help='Disable storing slugs from imported posts within output. '
'With this disabled, your Pelican URLs may not be consistent '
'with your original posts.')
args = parser.parse_args() args = parser.parse_args()
@ -328,4 +338,5 @@ def main():
fields2pelican(fields, args.markup, args.output, fields2pelican(fields, args.markup, args.output,
dircat=args.dircat or False, dircat=args.dircat or False,
strip_raw=args.strip_raw or False) strip_raw=args.strip_raw or False,
disable_slugs=args.disable_slugs or False)

View file

@ -48,6 +48,26 @@ class TestWordpressXmlImporter(unittest.TestCase):
strip_raw=True)) strip_raw=True))
self.assertFalse(any('<iframe' in rst for rst in rst_files)) self.assertFalse(any('<iframe' in rst for rst in rst_files))
def test_can_toggle_slug_storage(self):
    """Check that ``disable_slugs`` controls slug metadata in the output.

    Runs ``fields2pelican`` (wrapped with ``mute(True)``) over the posts in
    ``self.posts`` for both Markdown and reST output. With default
    arguments every generated file must contain the slug header
    (``Slug:`` for Markdown, ``:slug:`` for reST); with
    ``disable_slugs=True`` none of them may contain it.
    """

    def read(path):
        # Use a context manager so each handle is closed right away
        # rather than being left for the garbage collector.
        with open(path) as handle:
            return handle.read()

    posts = list(self.posts)
    silent_f2p = mute(True)(fields2pelican)
    with temporary_folder() as temp:
        # Markdown output: slug stored by default ...
        md_files = [read(f) for f in silent_f2p(posts, 'markdown', temp)]
        self.assertTrue(all('Slug:' in md for md in md_files))

        # ... and omitted when slug storage is disabled.
        md_files = [read(f) for f in silent_f2p(posts, 'markdown', temp,
                                                disable_slugs=True)]
        self.assertFalse(any('Slug:' in md for md in md_files))

        # reST output: same two checks with the reST field syntax.
        rst_files = [read(f) for f in silent_f2p(posts, 'rst', temp)]
        self.assertTrue(all(':slug:' in rst for rst in rst_files))

        rst_files = [read(f) for f in silent_f2p(posts, 'rst', temp,
                                                 disable_slugs=True)]
        self.assertFalse(any(':slug:' in rst for rst in rst_files))
def test_decode_html_entities_in_titles(self): def test_decode_html_entities_in_titles(self):
posts = list(self.posts) posts = list(self.posts)
test_posts = [post for post in posts if post[2] == 'html-entity-test'] test_posts = [post for post in posts if post[2] == 'html-entity-test']