diff --git a/docs/importer.rst b/docs/importer.rst new file mode 100644 index 00000000..35cd095c --- /dev/null +++ b/docs/importer.rst @@ -0,0 +1,56 @@ +================================= + Import from other blog software +================================= + +Description +=========== + +``importer.py`` is a command line tool for converting articles from other +software to ReStructuredText. The supported formats are: + +- Wordpress XML export +- Dotclear export +- RSS/ATOM feed + +The conversion from HTML to ReStructuredText relies on `pandoc +`_. For Dotclear, if the source posts are +written with Markdown syntax, they will not be converted (as Pelican also +supports Markdown). + +Usage +""""" + +| importer.py [-h] [--wpfile] [--dotclear] [--feed] [-o OUTPUT] +| [--dir-cat] +| input + +Optional arguments: +""""""""""""""""""" + + -h, --help show this help message and exit + --wpfile Wordpress XML export + --dotclear Dotclear export + --feed Feed to parse + -o OUTPUT, --output OUTPUT + Output path + --dir-cat Put files in directories with categories name + +Examples +======== + +for Wordpress:: + + $ python2 tools/importer.py --wpfile -o ~/output ~/posts.xml + +for Dotclear:: + + $ python2 tools/importer.py --dotclear -o ~/output ~/backup.txt + + +Tests +===== + +To test the module, one can use sample files: + +- for Wordpress: http://wpcandy.com/made/the-sample-post-collection +- for Dotclear: http://themes.dotaddict.org/files/public/downloads/lorem-backup.txt diff --git a/docs/index.rst b/docs/index.rst index cbc6bb19..9aba5609 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -42,8 +42,8 @@ If you want to see new features in Pelican, dont hesitate to tell me, to clone the repository, etc. That's open source, dude! Contact me at "alexis at notmyidea dot org" for any request/feedback! You can -also join the team at `#pelican on irc.freenode.org -`_ +also join the team at `#pelican on irc.freenode.org +`_ (or if you don't have any IRC client, using `the webchat `_) for quick feedback. @@ -55,11 +55,12 @@ A french version of the documentation is available at :doc:`fr/index`. .. toctree:: :maxdepth: 2 - + getting_started settings themes internals pelican-themes + importer faq contribute diff --git a/tools/importer.py b/tools/importer.py index 612767b3..44f02fbd 100755 --- a/tools/importer.py +++ b/tools/importer.py @@ -27,7 +27,9 @@ def wp2fields(xml): date = time.strftime("%Y-%m-%d %H:%M", date_object) author = item.fetch('dc:creator')[0].contents[0].title() - categories = [(cat['nicename'],cat.contents[0]) for cat in item.fetch(domain='category')] + + categories = [cat.contents[0] for cat in item.fetch(domain='category')] + # caturl = [cat['nicename'] for cat in item.fetch(domain='category')] tags = [tag.contents[0].title() for tag in item.fetch(domain='tag', nicename=None)] @@ -101,16 +103,22 @@ def dc2fields(file): # post_meta = fields[27] # redirect_url = fields[28][:-1] + # remove seconds + post_creadt = ':'.join(post_creadt.split(':')[0:2]) + author = "" - categories = "" + categories = [] + tags = [] + if cat_id: - categories = category_list[cat_id] - tags = "" + categories = [category_list[id].strip() for id in cat_id.split(',')] if post_format == "markdown": content = post_excerpt + post_content else: content = post_excerpt_xhtml + post_content_xhtml + content = content.replace('\\n', '') + post_format = "html" yield (post_title, content, post_url, post_creadt, author, categories, tags, post_format) @@ -135,7 +143,7 @@ def build_header(title, date, author, categories, tags): if date: header += ':date: %s\n' % date if categories: - header += ':category: %s\n' % categories + header += ':category: %s\n' % ', '.join(categories) if tags: header += ':tags: %s\n' % ', '.join(tags) header += '\n' @@ -147,52 +155,53 @@ def build_markdown_header(title, date, author, categories, tags): if date: header += 'Date: %s\n' % date if categories: - header += 'Category: %s\n' % categories + header += 'Category: %s\n' % ', '.join(categories) if tags: header += 'Tags: %s\n' % ', '.join(tags) header += '\n' return header -def fields2pelican(fields, output_path): +def fields2pelican(fields, output_path, dircat=False): for title, content, filename, date, author, categories, tags, markup in fields: if markup == "markdown": - md_filename = os.path.join(output_path, filename+'.md') + ext = '.md' header = build_markdown_header(title, date, author, categories, tags) - - # content.replace('\r\n', '\n') - - with open(md_filename, 'w', encoding='utf-8') as fp: - fp.write(header+content) - else: - filename = os.path.basename(filename) - html_filename = os.path.join(output_path, filename+'.html') + ext = '.rst' + header = build_header(title, date, author, categories, tags) - # if(len(categories) == 1): - # rst_filename = os.path.join(output_path, categories[0][0], filename+'.rst') - # if not os.path.isdir(os.path.join(output_path, categories[0][0])): - # os.mkdir(os.path.join(output_path, categories[0][0])) - # else: - rst_filename = os.path.join(output_path, filename+'.rst') + filename = os.path.basename(filename) + + # option to put files in directories with categories names + if dircat and (len(categories) == 1): + catname = categories[0] + out_filename = os.path.join(output_path, catname, filename+'.rst') + if not os.path.isdir(os.path.join(output_path, catname)): + os.mkdir(os.path.join(output_path, catname)) + else: + out_filename = os.path.join(output_path, filename+ext) + + print out_filename + + if markup == "html": + html_filename = os.path.join(output_path, filename+'.html') with open(html_filename, 'w', encoding='utf-8') as fp: fp.write(content) - print rst_filename - os.system('pandoc --normalize --reference-links --from=html --to=rst -o %s %s' % (rst_filename, + os.system('pandoc --normalize --reference-links --from=html --to=rst -o "%s" "%s"' % (out_filename, html_filename)) os.remove(html_filename) - with open(rst_filename, 'r', encoding='utf-8') as fs: + with open(out_filename, 'r', encoding='utf-8') as fs: content = fs.read() - with open(rst_filename, 'w', encoding='utf-8') as fs: - # categories = [x[1] for x in categories] - header = build_header(title, date, author, categories, tags) - fs.write(header + content) + + with open(out_filename, 'w', encoding='utf-8') as fs: + fs.write(header + content) -def main(input_type, input, output_path): +def main(input_type, input, output_path, dircat=False): if input_type == 'wordpress': fields = wp2fields(input) elif input_type == 'dotclear': @@ -200,13 +209,13 @@ def main(input_type, input, output_path): elif input_type == 'feed': fields = feed2fields(input) - fields2pelican(fields, output_path) + fields2pelican(fields, output_path, dircat=dircat) if __name__ == '__main__': parser = argparse.ArgumentParser( - description="Transform even feed or XML files to rst files." - "Be sure to have pandoc installed") + description="Transform feed, Wordpress or Dotclear files to rst files." + "Be sure to have pandoc installed") parser.add_argument(dest='input', help='The input file to read') parser.add_argument('--wpfile', action='store_true', dest='wpfile', @@ -214,9 +223,11 @@ if __name__ == '__main__': parser.add_argument('--dotclear', action='store_true', dest='dotclear', help='Dotclear export') parser.add_argument('--feed', action='store_true', dest='feed', - help='feed to parse') + help='Feed to parse') parser.add_argument('-o', '--output', dest='output', default='output', help='Output path') + parser.add_argument('--dir-cat', action='store_true', dest='dircat', + help='Put files in directories with categories name') args = parser.parse_args() input_type = None @@ -227,6 +238,6 @@ if __name__ == '__main__': elif args.feed: input_type = 'feed' else: - print "you must provide either --wpfile or --feed options" + print "you must provide either --wpfile, --dotclear or --feed options" exit() - main(input_type, args.input, args.output) + main(input_type, args.input, args.output, dircat=args.dircat)