Merge pull request #151 from saimn/importer

improvements for importers
This commit is contained in:
Alexis Metaireau 2011-08-04 01:21:59 -07:00
commit d6a2946aef
3 changed files with 105 additions and 37 deletions

56
docs/importer.rst Normal file
View file

@ -0,0 +1,56 @@
=================================
Import from other blog software
=================================
Description
===========
``importer.py`` is a command-line tool for converting articles from other
blog software to reStructuredText. The supported formats are:
- Wordpress XML export
- Dotclear export
- RSS/ATOM feed
The conversion from HTML to reStructuredText relies on `pandoc
<http://johnmacfarlane.net/pandoc/>`_, which must be installed. For Dotclear,
posts written with Markdown syntax are not converted: they are kept as
Markdown (as Pelican also supports Markdown).
Usage
"""""
| importer.py [-h] [--wpfile] [--dotclear] [--feed] [-o OUTPUT]
| [--dir-cat]
| input
Optional arguments:
"""""""""""""""""""
-h, --help show this help message and exit
--wpfile Wordpress XML export
--dotclear Dotclear export
--feed Feed to parse
-o OUTPUT, --output OUTPUT
Output path
--dir-cat Put files in directories with categories name
Examples
========
for Wordpress::
$ python2 tools/importer.py --wpfile -o ~/output ~/posts.xml
for Dotclear::
$ python2 tools/importer.py --dotclear -o ~/output ~/backup.txt
Tests
=====
To test the module, one can use sample files:
- for Wordpress: http://wpcandy.com/made/the-sample-post-collection
- for Dotclear: http://themes.dotaddict.org/files/public/downloads/lorem-backup.txt

View file

@ -42,8 +42,8 @@ If you want to see new features in Pelican, don't hesitate to tell me, to clone
the repository, etc. That's open source, dude!
Contact me at "alexis at notmyidea dot org" for any request/feedback! You can
also join the team at `#pelican on irc.freenode.org
<irc://irc.freenode.net/pelican>`_
also join the team at `#pelican on irc.freenode.org
<irc://irc.freenode.net/pelican>`_
(or if you don't have any IRC client, using `the webchat
<http://webchat.freenode.net/?channels=pelican&uio=d4>`_)
for quick feedback.
@ -55,11 +55,12 @@ A french version of the documentation is available at :doc:`fr/index`.
.. toctree::
:maxdepth: 2
getting_started
settings
themes
internals
pelican-themes
importer
faq
contribute

View file

@ -27,7 +27,9 @@ def wp2fields(xml):
date = time.strftime("%Y-%m-%d %H:%M", date_object)
author = item.fetch('dc:creator')[0].contents[0].title()
categories = [(cat['nicename'],cat.contents[0]) for cat in item.fetch(domain='category')]
categories = [cat.contents[0] for cat in item.fetch(domain='category')]
# caturl = [cat['nicename'] for cat in item.fetch(domain='category')]
tags = [tag.contents[0].title() for tag in item.fetch(domain='tag', nicename=None)]
@ -101,16 +103,22 @@ def dc2fields(file):
# post_meta = fields[27]
# redirect_url = fields[28][:-1]
# remove seconds
post_creadt = ':'.join(post_creadt.split(':')[0:2])
author = ""
categories = ""
categories = []
tags = []
if cat_id:
categories = category_list[cat_id]
tags = ""
categories = [category_list[id].strip() for id in cat_id.split(',')]
if post_format == "markdown":
content = post_excerpt + post_content
else:
content = post_excerpt_xhtml + post_content_xhtml
content = content.replace('\\n', '')
post_format = "html"
yield (post_title, content, post_url, post_creadt, author, categories, tags, post_format)
@ -135,7 +143,7 @@ def build_header(title, date, author, categories, tags):
if date:
header += ':date: %s\n' % date
if categories:
header += ':category: %s\n' % categories
header += ':category: %s\n' % ', '.join(categories)
if tags:
header += ':tags: %s\n' % ', '.join(tags)
header += '\n'
@ -147,52 +155,53 @@ def build_markdown_header(title, date, author, categories, tags):
if date:
header += 'Date: %s\n' % date
if categories:
header += 'Category: %s\n' % categories
header += 'Category: %s\n' % ', '.join(categories)
if tags:
header += 'Tags: %s\n' % ', '.join(tags)
header += '\n'
return header
def fields2pelican(fields, output_path):
def fields2pelican(fields, output_path, dircat=False):
for title, content, filename, date, author, categories, tags, markup in fields:
if markup == "markdown":
md_filename = os.path.join(output_path, filename+'.md')
ext = '.md'
header = build_markdown_header(title, date, author, categories, tags)
# content.replace('\r\n', '\n')
with open(md_filename, 'w', encoding='utf-8') as fp:
fp.write(header+content)
else:
filename = os.path.basename(filename)
html_filename = os.path.join(output_path, filename+'.html')
ext = '.rst'
header = build_header(title, date, author, categories, tags)
# if(len(categories) == 1):
# rst_filename = os.path.join(output_path, categories[0][0], filename+'.rst')
# if not os.path.isdir(os.path.join(output_path, categories[0][0])):
# os.mkdir(os.path.join(output_path, categories[0][0]))
# else:
rst_filename = os.path.join(output_path, filename+'.rst')
filename = os.path.basename(filename)
# option to put files in directories with categories names
if dircat and (len(categories) == 1):
catname = categories[0]
out_filename = os.path.join(output_path, catname, filename+'.rst')
if not os.path.isdir(os.path.join(output_path, catname)):
os.mkdir(os.path.join(output_path, catname))
else:
out_filename = os.path.join(output_path, filename+ext)
print out_filename
if markup == "html":
html_filename = os.path.join(output_path, filename+'.html')
with open(html_filename, 'w', encoding='utf-8') as fp:
fp.write(content)
print rst_filename
os.system('pandoc --normalize --reference-links --from=html --to=rst -o %s %s' % (rst_filename,
os.system('pandoc --normalize --reference-links --from=html --to=rst -o "%s" "%s"' % (out_filename,
html_filename))
os.remove(html_filename)
with open(rst_filename, 'r', encoding='utf-8') as fs:
with open(out_filename, 'r', encoding='utf-8') as fs:
content = fs.read()
with open(rst_filename, 'w', encoding='utf-8') as fs:
# categories = [x[1] for x in categories]
header = build_header(title, date, author, categories, tags)
fs.write(header + content)
with open(out_filename, 'w', encoding='utf-8') as fs:
fs.write(header + content)
def main(input_type, input, output_path):
def main(input_type, input, output_path, dircat=False):
if input_type == 'wordpress':
fields = wp2fields(input)
elif input_type == 'dotclear':
@ -200,13 +209,13 @@ def main(input_type, input, output_path):
elif input_type == 'feed':
fields = feed2fields(input)
fields2pelican(fields, output_path)
fields2pelican(fields, output_path, dircat=dircat)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Transform even feed or XML files to rst files."
"Be sure to have pandoc installed")
description="Transform feed, Wordpress or Dotclear files to rst files."
"Be sure to have pandoc installed")
parser.add_argument(dest='input', help='The input file to read')
parser.add_argument('--wpfile', action='store_true', dest='wpfile',
@ -214,9 +223,11 @@ if __name__ == '__main__':
parser.add_argument('--dotclear', action='store_true', dest='dotclear',
help='Dotclear export')
parser.add_argument('--feed', action='store_true', dest='feed',
help='feed to parse')
help='Feed to parse')
parser.add_argument('-o', '--output', dest='output', default='output',
help='Output path')
parser.add_argument('--dir-cat', action='store_true', dest='dircat',
help='Put files in directories with categories name')
args = parser.parse_args()
input_type = None
@ -227,6 +238,6 @@ if __name__ == '__main__':
elif args.feed:
input_type = 'feed'
else:
print "you must provide either --wpfile or --feed options"
print "you must provide either --wpfile, --dotclear or --feed options"
exit()
main(input_type, args.input, args.output)
main(input_type, args.input, args.output, dircat=args.dircat)