From 3b37b4263344e8ec87c1ca0727eed05a4094ad14 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 3 Aug 2011 17:04:39 +0200 Subject: [PATCH 1/3] some improvements for importers: - for dotclear: remove seconds in date, remove \\n - fix categories for wordpress (categories is a list a string) - refactor a bit the code between markdown and rst --- tools/importer.py | 63 ++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/tools/importer.py b/tools/importer.py index 612767b3..d7a5c98c 100755 --- a/tools/importer.py +++ b/tools/importer.py @@ -27,7 +27,9 @@ def wp2fields(xml): date = time.strftime("%Y-%m-%d %H:%M", date_object) author = item.fetch('dc:creator')[0].contents[0].title() - categories = [(cat['nicename'],cat.contents[0]) for cat in item.fetch(domain='category')] + + categories = [cat.contents[0] for cat in item.fetch(domain='category')] + # caturl = [cat['nicename'] for cat in item.fetch(domain='category')] tags = [tag.contents[0].title() for tag in item.fetch(domain='tag', nicename=None)] @@ -101,16 +103,22 @@ def dc2fields(file): # post_meta = fields[27] # redirect_url = fields[28][:-1] + # remove seconds + post_creadt = ':'.join(post_creadt.split(':')[0:2]) + author = "" - categories = "" + categories = [] + tags = [] + if cat_id: - categories = category_list[cat_id] - tags = "" + categories = [category_list[id].strip() for id in cat_id.split(',')] if post_format == "markdown": content = post_excerpt + post_content else: content = post_excerpt_xhtml + post_content_xhtml + content = content.replace('\\n', '') + post_format = "html" yield (post_title, content, post_url, post_creadt, author, categories, tags, post_format) @@ -135,7 +143,7 @@ def build_header(title, date, author, categories, tags): if date: header += ':date: %s\n' % date if categories: - header += ':category: %s\n' % categories + header += ':category: %s\n' % ', '.join(categories) if tags: header += ':tags: %s\n' % ', '.join(tags) header 
+= '\n' @@ -147,7 +155,7 @@ def build_markdown_header(title, date, author, categories, tags): if date: header += 'Date: %s\n' % date if categories: - header += 'Category: %s\n' % categories + header += 'Category: %s\n' % ', '.join(categories) if tags: header += 'Tags: %s\n' % ', '.join(tags) header += '\n' @@ -156,40 +164,39 @@ def build_markdown_header(title, date, author, categories, tags): def fields2pelican(fields, output_path): for title, content, filename, date, author, categories, tags, markup in fields: if markup == "markdown": - md_filename = os.path.join(output_path, filename+'.md') + ext = '.md' header = build_markdown_header(title, date, author, categories, tags) - - # content.replace('\r\n', '\n') - - with open(md_filename, 'w', encoding='utf-8') as fp: - fp.write(header+content) - else: - filename = os.path.basename(filename) - html_filename = os.path.join(output_path, filename+'.html') + ext = '.rst' + header = build_header(title, date, author, categories, tags) - # if(len(categories) == 1): - # rst_filename = os.path.join(output_path, categories[0][0], filename+'.rst') - # if not os.path.isdir(os.path.join(output_path, categories[0][0])): - # os.mkdir(os.path.join(output_path, categories[0][0])) - # else: - rst_filename = os.path.join(output_path, filename+'.rst') + # TODO: add options to put files in directories by categories + # if(len(categories) == 1): + # out_filename = os.path.join(output_path, categories[0], filename+'.rst') + # if not os.path.isdir(os.path.join(output_path, categories[0])): + # os.mkdir(os.path.join(output_path, categories[0])) + # else: + + filename = os.path.basename(filename) + out_filename = os.path.join(output_path, filename+ext) + print out_filename + + if markup == "html": + html_filename = os.path.join(output_path, filename+'.html') with open(html_filename, 'w', encoding='utf-8') as fp: fp.write(content) - print rst_filename - os.system('pandoc --normalize --reference-links --from=html --to=rst -o %s %s' % 
(rst_filename, + os.system('pandoc --normalize --reference-links --from=html --to=rst -o %s %s' % (out_filename, html_filename)) os.remove(html_filename) - with open(rst_filename, 'r', encoding='utf-8') as fs: + with open(out_filename, 'r', encoding='utf-8') as fs: content = fs.read() - with open(rst_filename, 'w', encoding='utf-8') as fs: - # categories = [x[1] for x in categories] - header = build_header(title, date, author, categories, tags) - fs.write(header + content) + + with open(out_filename, 'w', encoding='utf-8') as fs: + fs.write(header + content) def main(input_type, input, output_path): From 69636a9da0368e625a2ba5aaa5dd1737e7741ef5 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 3 Aug 2011 19:11:54 +0200 Subject: [PATCH 2/3] importer: add an option to put files in directories with category names --- tools/importer.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/tools/importer.py b/tools/importer.py index d7a5c98c..29409f17 100755 --- a/tools/importer.py +++ b/tools/importer.py @@ -161,7 +161,7 @@ def build_markdown_header(title, date, author, categories, tags): header += '\n' return header -def fields2pelican(fields, output_path): +def fields2pelican(fields, output_path, dircat=False): for title, content, filename, date, author, categories, tags, markup in fields: if markup == "markdown": ext = '.md' @@ -170,15 +170,17 @@ def fields2pelican(fields, output_path): ext = '.rst' header = build_header(title, date, author, categories, tags) - # TODO: add options to put files in directories by categories - # if(len(categories) == 1): - # out_filename = os.path.join(output_path, categories[0], filename+'.rst') - # if not os.path.isdir(os.path.join(output_path, categories[0])): - # os.mkdir(os.path.join(output_path, categories[0])) - # else: - filename = os.path.basename(filename) - out_filename = os.path.join(output_path, filename+ext) + + # option to put files in directories with categories names + if 
dircat and (len(categories) == 1): + catname = categories[0] + out_filename = os.path.join(output_path, catname, filename+'.rst') + if not os.path.isdir(os.path.join(output_path, catname)): + os.mkdir(os.path.join(output_path, catname)) + else: + out_filename = os.path.join(output_path, filename+ext) + print out_filename if markup == "html": @@ -187,7 +189,7 @@ def fields2pelican(fields, output_path): with open(html_filename, 'w', encoding='utf-8') as fp: fp.write(content) - os.system('pandoc --normalize --reference-links --from=html --to=rst -o %s %s' % (out_filename, + os.system('pandoc --normalize --reference-links --from=html --to=rst -o "%s" "%s"' % (out_filename, html_filename)) os.remove(html_filename) @@ -199,7 +201,7 @@ def fields2pelican(fields, output_path): fs.write(header + content) -def main(input_type, input, output_path): +def main(input_type, input, output_path, dircat=False): if input_type == 'wordpress': fields = wp2fields(input) elif input_type == 'dotclear': @@ -207,7 +209,7 @@ def main(input_type, input, output_path): elif input_type == 'feed': fields = feed2fields(input) - fields2pelican(fields, output_path) + fields2pelican(fields, output_path, dircat=dircat) if __name__ == '__main__': @@ -224,6 +226,8 @@ if __name__ == '__main__': help='feed to parse') parser.add_argument('-o', '--output', dest='output', default='output', help='Output path') + parser.add_argument('--dir-cat', action='store_true', dest='dircat', + help='Put files in directories with categories name') args = parser.parse_args() input_type = None @@ -236,4 +240,4 @@ if __name__ == '__main__': else: print "you must provide either --wpfile or --feed options" exit() - main(input_type, args.input, args.output) + main(input_type, args.input, args.output, dircat=args.dircat) From 7b7695509df31485ac799bfbe2f94b327b98d1cf Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 3 Aug 2011 22:06:10 +0200 Subject: [PATCH 3/3] importer - add documentation --- docs/importer.rst | 56 
+++++++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 7 +++--- tools/importer.py | 8 +++---- 3 files changed, 64 insertions(+), 7 deletions(-) create mode 100644 docs/importer.rst diff --git a/docs/importer.rst b/docs/importer.rst new file mode 100644 index 00000000..35cd095c --- /dev/null +++ b/docs/importer.rst @@ -0,0 +1,56 @@ +================================= + Import from other blog software +================================= + +Description +=========== + +``importer.py`` is a command line tool for converting articles from other +software to ReStructuredText. The supported formats are: + +- Wordpress XML export +- Dotclear export +- RSS/ATOM feed + +The conversion from HTML to ReStructuredText relies on `pandoc +<http://johnmacfarlane.net/pandoc/>`_. For Dotclear, if the source posts are +written with Markdown syntax, they will not be converted (as Pelican also +supports Markdown). + +Usage +""""" + +| importer.py [-h] [--wpfile] [--dotclear] [--feed] [-o OUTPUT] +| [--dir-cat] +| input + +Optional arguments: +""""""""""""""""""" + + -h, --help show this help message and exit + --wpfile Wordpress XML export + --dotclear Dotclear export + --feed Feed to parse + -o OUTPUT, --output OUTPUT + Output path + --dir-cat Put files in directories with categories name + +Examples +======== + +for Wordpress:: + + $ python2 tools/importer.py --wpfile -o ~/output ~/posts.xml + +for Dotclear:: + + $ python2 tools/importer.py --dotclear -o ~/output ~/backup.txt + + +Tests +===== + +To test the module, one can use sample files: + +- for Wordpress: http://wpcandy.com/made/the-sample-post-collection +- for Dotclear: http://themes.dotaddict.org/files/public/downloads/lorem-backup.txt diff --git a/docs/index.rst b/docs/index.rst index cbc6bb19..9aba5609 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -42,8 +42,8 @@ If you want to see new features in Pelican, dont hesitate to tell me, to clone the repository, etc. That's open source, dude! 
Contact me at "alexis at notmyidea dot org" for any request/feedback! You can -also join the team at `#pelican on irc.freenode.org -<irc://irc.freenode.net/pelican>`_ +also join the team at `#pelican on irc.freenode.org +<irc://irc.freenode.net/pelican>`_ (or if you don't have any IRC client, using `the webchat <http://webchat.freenode.net/?channels=pelican&uio=d4>`_) for quick feedback. @@ -55,11 +55,12 @@ A french version of the documentation is available at :doc:`fr/index`. .. toctree:: :maxdepth: 2 - + getting_started settings themes internals pelican-themes + importer faq contribute diff --git a/tools/importer.py b/tools/importer.py index 29409f17..44f02fbd 100755 --- a/tools/importer.py +++ b/tools/importer.py @@ -214,8 +214,8 @@ def main(input_type, input, output_path, dircat=False): if __name__ == '__main__': parser = argparse.ArgumentParser( - description="Transform even feed or XML files to rst files." - "Be sure to have pandoc installed") + description="Transform feed, Wordpress or Dotclear files to rst files." + "Be sure to have pandoc installed") parser.add_argument(dest='input', help='The input file to read') parser.add_argument('--wpfile', action='store_true', dest='wpfile', @@ -223,7 +223,7 @@ if __name__ == '__main__': parser.add_argument('--dotclear', action='store_true', dest='dotclear', help='Dotclear export') parser.add_argument('--feed', action='store_true', dest='feed', - help='feed to parse') + help='Feed to parse') parser.add_argument('-o', '--output', dest='output', default='output', help='Output path') parser.add_argument('--dir-cat', action='store_true', dest='dircat', @@ -238,6 +238,6 @@ if __name__ == '__main__': elif args.feed: input_type = 'feed' else: - print "you must provide either --wpfile or --feed options" + print "you must provide either --wpfile, --dotclear or --feed options" exit() main(input_type, args.input, args.output, dircat=args.dircat)